Uncovering Nutritional Trends among Popular Filipino Dishes

In [1]:
# For data prep
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd
import sqlite3
from sqlalchemy import create_engine

# For PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from scipy.spatial.distance import euclidean
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import math
import bisect

# For Clustering
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.cluster.hierarchy import fcluster

# For plotting
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotnine import *

# For writeup
from IPython.display import HTML
from IPython.core.display import HTML as Center
import warnings
warnings.filterwarnings('ignore')
In [2]:
# Render a "toggle code" button in the notebook output. The embedded script
# uses `$` (jQuery-style selectors — assumes the hosting page provides it) to
# hide every `div.input` element once the document is ready; each click on
# the button flips the visibility again.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
Out[2]:
In [3]:
def _build_proj_theme(text_size, strip_text_size):
    """Build the shared plotnine theme used by the project's figures.

    Parameters
    ----------
    text_size : int
        Base font size applied through the ``text`` theme element.

    strip_text_size : int
        Font size of the facet strip labels (``strip_text_x``).

    Returns
    -------
    theme
        plotnine theme carrying the project's colors, legend placement and
        grid styling.
    """
    return theme(
        strip_background=element_rect(fill="white"),
        legend_title=element_blank(),
        legend_text=element_text(size=9),
        legend_position=(0.5, 0.93),
        legend_direction="horizontal",
        legend_box_background=(
            element_rect(fill="transparent", color="transparent")
        ),
        axis_text_x=element_text(size=10, color="#722502"),
        axis_text_y=element_text(size=10, color="#722502", face="bold"),
        text=element_text(family="sans", color="#722502", size=text_size),
        axis_title_x=(
            element_text(color='#DA4D2E', size=12, face="bold", margin={'t': 15})
        ),
        axis_title_y=(
            element_text(color='#DA4D2E', size=12, face="bold", margin={'r': 15})
        ),
        plot_title=(
            element_text(hjust=0.5, size=14, face="bold", margin={'b': 33})
        ),
        panel_spacing=0.5,
        panel_background=element_rect(fill="white", color="white"),
        plot_background=element_rect(fill="white"),
        panel_grid_major_x=element_line(colour="white"),
        panel_grid_major_y=element_line(colour="#cfa544", linetype="dashed"),
        panel_grid_minor=element_blank(),
        strip_text_x=(
            element_text(
                size=strip_text_size, hjust=0.5, color="#722502", face="bold"
            )
        )
    )


# The two themes are identical except for the base text size and the facet
# strip label size.
proj_theme = _build_proj_theme(text_size=18, strip_text_size=14)
proj_theme2 = _build_proj_theme(text_size=14, strip_text_size=12)
In [4]:
def get_dish_urls():
    """Collect recipe links from the chicken, pork and beef category pages

    For every category the first page is fetched to read the number of
    pages from the pagination widget (the item right after the
    'pagination-omission' entry), then each page is fetched and every
    'entry-title-link' anchor is recorded.

    Returns
    -------
    dish_urls: dict
        Dictionary containing dish names as keys and URLs as values
    """
    collected = {}

    for meat in ('chicken', 'pork', 'beef'):
        # Landing page of the category: only used to find the page count
        landing = BeautifulSoup(
            requests.get(
                f'https://panlasangpinoy.com/{meat}-recipes/'
            ).text
        )
        pager = landing.select_one(
            "div[class='archive-pagination pagination']"
        )
        final_item = pager.select_one(
            "li[class='pagination-omission'] + li"
        )
        page_count = int(final_item.select_one("a").contents[1])

        # Walk every listing page of this category
        for n in range(1, page_count + 1):
            listing = BeautifulSoup(
                requests.get(
                    f'https://panlasangpinoy.com/{meat}-recipes/page/{n}/'
                ).text
            )
            # Later pages overwrite earlier entries that share a dish name
            collected.update({
                link.contents[0]: link['href']
                for link in listing.select("a[class='entry-title-link']")
            })

    print("Extraction of URLs done. Total number of dishes:", len(collected))

    return collected


def _extract_field(soup, selector, getter):
    """Apply ``getter`` to the first element matching ``selector``, else NaN.

    NaN is returned when the element is missing (``select_one`` yields None,
    so the getter raises AttributeError) or when the element carries no
    usable content (IndexError from an empty list).
    """
    try:
        return getter(soup.select_one(selector))
    except (AttributeError, IndexError):
        return np.nan


def get_ingredients(dish_urls):
    """Get ingredients for all dishes

    Parameters
    ----------
    dish_urls: dict
        Dictionary containing dish names as keys and URLs as values

    Returns
    -------
    dish_dict: dict
        Dictionary containing dish names as keys and a dictionary of dish
        information and ingredients as their values. Missing dish
        information is stored as NaN; ingredient values are the weights
        computed by ``get_weight``.
    """
    # Selector template shared by the prep/cook/total time labels; the value
    # lives in the span that immediately follows the label.
    time_selector = (
        "span[class='wprm-recipe-details-label wprm-block-text-bold "
        "wprm-recipe-time-label wprm-recipe-{}-time-label'] + span"
    )

    # Prepare dictionary
    dish_dict = {}

    # Get all dish information and ingredients per dish
    for key, url in dish_urls.items():
        print("Getting details for: ", key)
        response = requests.get(url)
        soup = BeautifulSoup(response.text)

        dish_details = {}

        dish_details['course'] = _extract_field(
            soup,
            "span[class='wprm-recipe-course "
            "wprm-block-text-normal']",
            lambda el: el.contents[0]
        )

        # The fallback previously wrote to a misspelled 'cusine' key, which
        # left 'cuisine' undefined for dishes without that field.
        dish_details['cuisine'] = _extract_field(
            soup,
            "span[class='wprm-recipe-cuisine "
            "wprm-block-text-normal']",
            lambda el: el.contents[0]
        )

        for kind in ('prep', 'cook', 'total'):
            dish_details[f'{kind}_time'] = _extract_field(
                soup, time_selector.format(kind), lambda el: el.text
            )

        # First run of digits in the text following the servings label; NaN
        # when the label is missing or no digits are present.
        dish_details['servings'] = _extract_field(
            soup,
            "span[class='wprm-recipe-details-label wprm-block-text-bold "
            "wprm-recipe-servings-label']",
            lambda el: int(re.findall(r'\d+', el.next_sibling.text)[0])
        )

        dish_details['calories'] = _extract_field(
            soup,
            "span[class='wprm-recipe-nutrition-with-unit']",
            lambda el: el.text
        )

        # Loop through all ingredients
        for ing in soup.select("li[class='wprm-recipe-ingredient']"):
            ing_name = ing.select_one(
                "span[class='wprm-recipe-ingredient-name']"
            ).text
            try:
                str_amount = ing.select_one(
                    "span[class='wprm-recipe-ingredient-amount']"
                ).text
                str_unit = ing.select_one(
                    "span[class='wprm-recipe-ingredient-unit']"
                ).text
                # Determine total numerical weight
                dish_details[ing_name] = get_weight(str_amount, str_unit)
            except AttributeError:
                # Amount or unit span is absent: attempt to extract both
                # from the ingredient name itself
                dish_details[ing_name] = get_weight(ing_name, ing_name)

        dish_dict[key] = dish_details

    return dish_dict


def get_weight(str_amount, str_weight):
    """Transform different measurements into base units

    The amount is the sum of (a) the first vulgar-fraction character found,
    (b) the first typed fraction such as ``1/2`` and (c) the first standalone
    whole number. When nothing is found the amount defaults to 1. The amount
    is then divided by the factor of the matching unit keyword (1 when no
    keyword matches).

    Parameters
    ----------
    str_amount: str
        Text holding the numeric amount (e.g. ``"1 1/2"``, ``"½"``)

    str_weight: str
        Text holding the unit of measurement; units are detected by
        substring search

    Returns
    -------
    weight: float
        Total calculated weight
    """
    amount = 0
    div_weight = 1

    # Set dictionary of vulgar fraction unicode characters
    vul_frac = {
        '¼':1/4,
        '½':.5,
        '¾':3/4,
        '⅐':1/7,
        '⅑':1/9,
        '⅒':.1,
        '⅓':1/3,
        '⅔':2/3,
        '⅕':.2,
        '⅖':.4,
        '⅗':.6,
        '⅘':.8,
        '⅙':1/6,
        '⅚':5/6,
        '⅛':1/8,
        '⅜':3/8,
        '⅝':5/8,
        '⅞':7/8
    }

    # Check for vulgar fraction characters (only the first hit is counted)
    for vul_char, frac in vul_frac.items():
        if vul_char in str_amount:
            amount += frac
            break

    # Check for typed fractions; a zero denominator is ignored instead of
    # raising ZeroDivisionError
    frac_match = re.search(r'(\d+)/(\d+)', str_amount)
    if frac_match is not None:
        num_frac, den_frac = (int(part) for part in frac_match.groups())
        if den_frac != 0:
            amount += num_frac / den_frac

    # Check for whole number. Digits are excluded from the look-arounds as
    # well as '/', otherwise the regex backtracks into a fraction and counts
    # part of it as a whole number (e.g. the "5" of "1/25").
    whole_match = re.search(r'(?<![/\d])\d+(?![/\d])', str_amount)
    if whole_match is not None:
        amount += int(whole_match.group())

    # Default to one when no amount is detected
    if amount == 0:
        amount += 1

    # Set unit-value mappings (divisors applied to the amount)
    # NOTE(review): most factors look like "units per cup" while 'lb' looks
    # like pounds per gram — confirm the intended base unit.
    unit_list = {
        'lb': 0.00220462,
        'bunch': 0.1,
        'liter': 0.236588,
        'quart': 0.25,
        'pinch': 0.355625,
        'pint': 0.5,
        'oz': 8,
        'ounce': 8,
        'thumb': 9.5,
        'tablespoon': 16,
        'tbsp': 16,
        'tsp': 48,
        'ml': 236.588
    }

    # Check for units; when several keywords match, the last one in the
    # mapping wins
    for unit_name, unit_val in unit_list.items():
        if unit_name in str_weight:
            div_weight = unit_val

    # Compute weight
    weight = amount / div_weight

    return weight


def get_nutrition(dish_urls):
    """Get nutritional value for all dishes

    Parameters
    ----------
    dish_urls: dict
        Dictionary containing dish names as keys and URLs as values

    Returns
    -------
    nut_dict: dict
        Dictionary containing dish names as keys and a dictionary of nutrients
        as their values. Each nutrient is keyed by the last word of its
        label (the word right before the colon); when that key is already
        taken by another label, the full label text is used instead so the
        earlier value is not overwritten. 'Serving' holds the number of
        servings, or NaN when it cannot be read.
    """
    # Prepare dictionary
    nut_dict = {}

    # Retrieve all nutrients per dish
    for key, url in dish_urls.items():
        print("Getting details for: ", key)
        response = requests.get(url)
        soup = BeautifulSoup(response.text)

        nut_details = {}

        # Get servings (NaN when the label is missing or holds no digits)
        try:
            servings = soup.select_one(
                "span[class='wprm-recipe-details-label wprm-block-text-bold "
                "wprm-recipe-servings-label']"
            ).next_sibling.text
            nut_details['Serving'] = int(re.findall(r'\d+', servings)[0])
        except (AttributeError, IndexError):
            nut_details['Serving'] = np.nan

        # Loop through all nutrients
        for nut in soup.select(
            "span[class='wprm-nutrition-label-text-nutrition-container']"
        ):
            label = nut.select_one(
                "span[class='wprm-nutrition-label-text-nutrition-label "
                "wprm-block-text-normal']"
            ).text

            # Full label without the trailing colon, e.g. "Saturated Fat"
            full_name = label.rsplit(':', 1)[0].strip()
            short_names = re.findall(r'(\w+):', label)
            nut_name = short_names[0] if short_names else full_name
            if not nut_name:
                # Nothing usable to key this nutrient by
                continue

            # A multi-word label such as "Saturated Fat:" shortens to "Fat"
            # and would overwrite the plain "Fat" entry; fall back to the
            # full label on such collisions.
            if nut_name in nut_details and full_name not in ('', nut_name):
                nut_name = full_name

            nut_details[nut_name] = nut.select_one(
                "span[class='wprm-nutrition-label-text-nutrition-value']"
            ).text

        nut_dict[key] = nut_details

    return nut_dict


def prepare_df(dish_dict):
    '''Create DataFrame out of dish dictionary

    Only dishes whose cuisine contains 'Filipino' are kept. Raw ingredient
    columns are then folded into ingredient categories: each category takes
    the row-wise sum of every remaining column whose name matches its
    pattern (case-insensitive), and those columns are removed so that a
    column is counted for at most one category.

    Parameters
    ----------
    dish_dict: dict
        Dictionary containing dish names as keys and a dictionary of dish
        information and ingredients as their values

    Returns
    -------
    df_new: DataFrame
        DataFrame with rows representing dishes and columns representing
        ingredient categories, plus a leading 'dish_name' column
    '''
    # Prepare DataFrame
    df_ing = pd.DataFrame.from_dict(dish_dict).T
    # Retain Filipino dishes
    df_ing = df_ing[df_ing.cuisine.str.contains('Filipino', na=False)]
    # Drop irrelevant columns
    df_ing = df_ing.drop(columns=["course", "cuisine", "prep_time",
                                  "cook_time", "total_time", "servings",
                                  "calories"])
    # Sort columns and drop empty columns
    df_ing = df_ing.sort_index(axis=1)
    df_ing = df_ing.dropna(axis=1, how='all')
    # fillna returns a new frame; the result has to be assigned back
    df_ing = df_ing.fillna(0)

    # Prepare ingredient-keyword mappings.
    # Order matters: a column goes to the FIRST category whose pattern
    # matches its name and is then unavailable to later categories.
    cat_ing_dict = {
        'yeast': r'(yeast)',
        'wrapper': r'(wrapper)',
        'worcestershire_sauce': r'(worcestershire)',
        'winged_bean': r'(winged bean)',
        'vinegar': r'(vinegar)',
        'turmeric': r'(tumeric)',
        # NOTE(review): also takes "tomato sauce"/"tomato paste" before
        # 'tomato_liquid' below can see them — confirm intent.
        'tomato': r'(tomato)',
        'tofu': r'(tofu)',
        'toasted_rice_powder': r'(toasted rice powder)',
        'taro': r'(taro)',
        'sweet_potato': r'(sweet potato)',
        'sugar': r'(sugar)',
        'star_anise': r'(star anise)',
        'squash': r'(kalabasa|squash)',
        'soy_sauce': r'(soy sauce)',
        'sinigang_mix': r'(sinigang)',
        'shortening': r'(shortening)',
        'sesame_oil': r'(sesame oil)',
        'scallion': r'(scallion)',
        'sayote': r'(sayote)',
        'salted_egg': r'(salted egg)',
        'safflower_oil': r'(safflower oil)',
        'raisins': r'(raisins)',
        'radish': r'(radish)',
        'potato': r'(potato)',
        'pork_insides': (
            r'(pig’s liver|pig’s heart|pig’s small intestine|'
            r'bung|pig cheeks|pig heart|pig kidney|pig stomach|'
            r'pork ears|pork large intenstine|pork liver|small intestine)'
        ),
        'pork_fat': r'(pork fat)',
        'pork_stock': r'(pork broth|pork stock|pork cube)',
        'pork_blood': r'(pork blood)',
        'pork_and_beans': r'(pork and beans)',
        'pie_crust': r'(pie)',
        'pickle': r'(pickle|relish)',
        'pepper_leaf': r'(pepper leaves)',
        'pechay': r'(pechay)',
        'peanut': r'(peanut)',
        'patola': r'(patola)',
        'parsley': r'(parsley)',
        'paprika': r'(paprika)',
        'papaya': r'(papaya)',
        'oyster_sauce': r'(oyster)',
        'onion': r'(onion)',
        'olive_oil': r'(olive oil)',
        'olive': r'(olive)',
        'okra': r'(okra)',
        'nutmeg': r'(nutmeg)',
        'noodle': r'(noodle|pancit|sotanghon|misua|miswa)',
        'mushroom': r'(mushroom)',
        'munggo': r'(mung)',
        'mirin': r'(mirin)',
        'mayonnaise': r'(mayonnaise)',
        'malunggay': r'(malunggay)',
        # NOTE(review): also takes "beef liver"/"chicken liver" before
        # 'beef_insides'/'chicken_insides' below — confirm intent.
        'liver_spread': r'(liver)',
        'lemongrass': r'(lemongrass)',
        'leeks': r'(leeks)',
        'lechon_sauce': r'(lechon)',
        'kasubha': r'(kasubha)',
        'kangkong': r'(spinach|kangkong)',
        'jicama': r'(jicama)',
        'jackfruit': r'(jackfruit)',
        # Word boundaries are required here: a bare "ice" also matches
        # "rice" and "juice", which starved the 'glutinous_rice', 'rice'
        # and 'pineapple_juice' categories defined further down.
        'ice': r'(\bice\b)',
        'hotdog': r'(hotdog)',
        'hot_sauce': r'(hot)',
        'honey': r'(honey)',
        'hoisin_sauce': r'(hoisin)',
        'green_pea': r'(green pea|pigeon pea)',
        'green_bean': (
            r'(green beans|sitaw|snake beans|string beans|snap pea|snow pea)'
        ),
        'glutinous_rice': r'(glutinous rice)',
        'ginger': r'(ginger)',
        'ginataang_gulay_mix': r'(ginataang)',
        'garlic': r'(garlic)',
        'flour': r'(flour)',
        'eggplant': r'(eggplant|talong)',
        'egg': r'(egg)',
        'curry_powder': r'(curry)',
        'cucumber': r'(cucumber)',
        # NOTE(review): also takes "coconut cream" before 'coconut_milk'
        # below — confirm intent.
        'cream': r'(cream)',
        'cooking_wine': r'(wine)',
        'cooking_oil': r'(cooking oil|vegetable oil)',
        'coconut_water': r'(coconut water)',
        'coconut_milk': r'(coconut cream|coconut milk)',
        'coconut_meat': r'(coconut meat)',
        'clear_soda': r'(7-up|sprite|clear softdrink)',
        'cinnamon': r'(cinnamon)',
        'sausage': r'(chinese sauage|chorizo)',
        'chicken_stock': r'(chicken broth|chicken cube)',
        'chicharon': r'(chicharon)',
        'cheese': r'(cheese)',
        'celery': r'(celery)',
        'carrot': r'(carrot)',
        'canned_meat': r'(potted meat|luncheon meat)',
        'calamansi': r'(calamansi|lemon|lime)',
        'cabbage': r'(cabbage)',
        'butter': r'(butter|margarine)',
        'broccoli': r'(broccoli)',
        'bread': r'(bread)',
        'bok_choy': r'(bok choy|bokchoy)',
        'black_soda': r'(coke|cola)',
        'black_bean': r'(black bean)',
        'beer': r'(beer)',
        'beef_insides': (
            r'(lard|cow|beef heart|beef kidney|beef large instestine|'
            r'beef liver|beef neck bone|beef small intestine|bile|tripe|'
            r'tongue|tripe|lengua)'
        ),
        'beef_stock': (
            r'(beef cube|beef bouillon|bulalo|beef broth|beef stock)'
        ),
        'bay_leaf': r'(bay)',
        'bamboo_shoots': r'(bamboo shoots)',
        'baking_powder': r'(baking powder)',
        'annatto': r'(annatto)',
        'ampalaya': r'(ampalaya)',
        'adobo_sauce': r'(adobo)',
        'achiote': r'(achiote)',
        'tomato_liquid': (
            r'(ketchup|tomato sauce|tomato paste|spaghetti sauce)'
        ),
        'banana_flower': r'(blossom)',
        'pepper': r'(white pepper|black pepper|crushed pepper|peppercorn)',
        'chili': (
            r'(chili|pepper flakes|serrano pepper|sili|jalapeno|'
            r'ghost pepper|green pepper)'
        ),
        'bell_pepper': r'(bell pepper)',
        'bagoong': r'(alamang|shrimp paste|balaw)',
        'liquid_seasoning': r'(liquid seasoning|savorrich|marinade)',
        'chickpea': r'(chick pea|garbanzos)',
        'chicken_insides': r'(chicken gizzard|chicken hear|chicken liver)',
        'cornstarch': r'(cornstarch)',
        'corned_beef': r'(corned beef)',
        'fish_sauce': r'(fish sauce)',
        'pineapple_juice': r'(pineapple juice|juice from the canned tidbits)',
        'shrimp_cube': r'(shrimp cube)',
        'watermelon': r'(watermelon)',
        'milk': r'(milk)',
        'pea': r'(pea)',
        'pasta': r'(spaghetti|macaroni)',
        'shrimp': r'(shrimp)',
        'pineapple': r'(pineapple)',
        'water': r'(water)',
        'salt': r'(salt)',
        'rice': r'(rice|sinangag)',
        'pork': r'(pork|pig|lechon)',
        'corn': r'(corn)',
        'chicken': r'(chicken)',
        'beef': r'(beef|steak|oxtail|ox tail|sirloin|bistek)',
        'banana': r'(banana|plantain)'
    }

    # Create new DataFrame
    df_new = pd.DataFrame(index=df_ing.index)

    # Loop through the categories
    for cat_name, cat_regex in cat_ing_dict.items():
        # Look for columns that contains the query
        ing_filter = (
            df_ing.columns.to_series()
            .str.contains(cat_regex, case=False, regex=True)
        )

        # Filter columns that satisfy the query
        filter_cols = ing_filter[ing_filter].index

        print(f"Columns obtained for {cat_name}:", filter_cols)

        # Add columns to the new DataFrame (0 when nothing matched)
        df_new[cat_name] = df_ing[filter_cols].sum(axis=1)

        # Drop columns to prevent reusing of ingredients
        df_ing = df_ing.drop(columns=filter_cols)

    df_new = df_new.rename_axis('dish_name').reset_index()
    return df_new


def prepare_nut_df(nut_dict, dish_idx):
    '''Build the nutrition table for a chosen set of dishes

    Parameters
    ----------
    nut_dict: dict
        Dictionary containing dish names as keys and a dictionary of nutrients
        as their values

    dish_idx: list
        Dish names to keep, in the order they should appear (taken from the
        Ingredient Information DataFrame)

    Returns
    -------
    df_nut: DataFrame
        One row per requested dish and one column per nutrient, preceded by
        a 'dish_name' column; nutrients missing for a dish are set to 0
    '''
    # Dishes as rows, nutrients as columns
    all_dishes = pd.DataFrame.from_dict(nut_dict).transpose()

    # Keep the requested dishes only and zero-fill the gaps
    selected = all_dishes.loc[dish_idx].fillna(0)

    # Move the dish names from the index into a regular column
    return selected.rename_axis('dish_name').reset_index()


def export_sql(df_dish, df_nut):
    """Export DataFrames to the 'ulam_nut.db' SQLite file

    Parameters
    ----------
    df_dish: DataFrame
        Ingredient table, written to the 'rekado' table

    df_nut: DataFrame
        Nutrition table, written to the 'nutrition' table

    Existing tables with the same names are replaced.
    """
    # Open connection to DB file
    conn = sqlite3.connect('ulam_nut.db')

    try:
        # Transform DataFrame into DB table
        df_dish.to_sql('rekado', con=conn, if_exists='replace', index=False)
        df_nut.to_sql('nutrition', con=conn, if_exists='replace', index=False)
    finally:
        # Close connection even when one of the writes fails
        conn.close()


def retrieve_data():
    '''Load the ingredient and nutrition tables from 'ulam_nut.db'

    Returns
    -------
    df_ing, df_nut: DataFrame
        Contents of the 'rekado' (ingredients) and 'nutrition' tables,
        respectively
    '''
    # Query texts are kept exactly as they were sent before
    ingredients_query = "\n        SELECT * FROM rekado\n        "
    nutrition_query = "\n        SELECT * FROM nutrition\n        "

    engine = create_engine('sqlite:///ulam_nut.db')
    with engine.connect() as connection:
        ingredients = pd.read_sql(ingredients_query, connection)
        nutrition = pd.read_sql(nutrition_query, connection)

    return ingredients, nutrition


def drop_features(data, irrelevant_cols, skip_cols, p):
    """
    Drop features given a variance threshold

    Parameters
    ----------
    data : dataframe
        dataframe from which the columns will be dropped. May contain
    a mix of numeric and categorical columns. Categorical columns are
    label-encoded before getting the variance of the columns.

    irrelevant_cols : list or None
        list of column names. Column names in this list will be dropped
    permanently.

    skip_cols : list or None
        list of column names. Column names in this list will be kept by
    default and will not undergo variance thresholding.

    p : float
        variance threshold passed to sklearn's VarianceThreshold. Columns
    whose variance falls below this value are dropped. Note that this is a
    variance, not a share of identical values: for a 0/1 column where a
    fraction q of the rows equals 1 the variance is q * (1 - q).

    Returns
    -------
        dataframe with the original (un-encoded) values, holding
    skip_cols (if given) followed by the columns that passed the threshold,
    numeric columns first and categorical columns after them.

    """
    orig_data = data.copy(deep=True)
    if irrelevant_cols is not None:
        data = orig_data.drop(irrelevant_cols, axis=1)
    if skip_cols is not None:
        data = data.drop(skip_cols, axis=1)

    num_data = data.select_dtypes(include=np.number)
    cat_data = data.select_dtypes(exclude=np.number)

    if cat_data.shape[1] != 0:
        # Encode each categorical column as integers so a variance exists
        cat_data = cat_data.apply(LabelEncoder().fit_transform)
        transformed_data = pd.concat([num_data, cat_data], axis=1)
    else:
        transformed_data = num_data

    # Only the fitted support mask is needed, not the transformed matrix
    thresh = VarianceThreshold(threshold=p)
    thresh.fit(transformed_data)
    ind = thresh.get_support(indices=True)
    cols_kept = list(transformed_data.columns[ind])

    if skip_cols is not None:
        return orig_data[list(skip_cols) + cols_kept]

    return orig_data[cols_kept]


def fix_dtypes(data):
    """Return a copy of the dataframe with automatically converted dtypes

    Columns are converted with ``DataFrame.convert_dtypes`` (string
    conversion and object inference disabled); afterwards every column
    whose name contains '_dt' is parsed as datetime, with unparseable
    values becoming NaT.

    Parameters
    ----------
    data : dataframe
        dataframe whose columns will be converted. May contain a mix of
    numeric and categorical columns. The input itself is left untouched.

    """
    converted = data.copy(deep=True).convert_dtypes(
        infer_objects=False, convert_string=False, convert_floating=True
    )
    datetime_cols = [name for name in converted.columns if '_dt' in name]
    for name in datetime_cols:
        converted[name] = pd.to_datetime(converted[name], errors='coerce')
    return converted


def manual_fix_dtypes(data, float_cols):
    """Return a copy of the dataframe with explicitly converted dtypes

    The listed columns are converted to numbers and every column whose name
    contains '_dt' is parsed as datetime. Values that cannot be converted
    become NaN / NaT.

    Parameters
    ----------
    data : dataframe
        dataframe whose columns will be converted. May contain a mix of
    numeric and categorical columns. The input itself is left untouched.

    float_cols : list
        List of columns to be converted to float
    """
    result = data.copy(deep=True)

    # Numeric conversion of the requested columns, column by column
    numeric_part = result[float_cols].apply(pd.to_numeric, errors='coerce')
    result[float_cols] = numeric_part

    # Datetime conversion driven by the '_dt' naming convention
    for name in [col for col in result.columns if '_dt' in col]:
        result[name] = pd.to_datetime(result[name], errors='coerce')

    return result

def truncated_svd(X, thresh=0.90):
    """Perform singular value decomposition on a design matrix X

    Despite the name, no truncation is applied: the full decomposition is
    returned and the caller selects the components to keep (for instance
    from the cumulative sum of the returned NSSD).

    Parameters
    ----------
    X : array
        Matrix of numbers to decompose.

    thresh : float
        A number between 0 to 1 meant as the cut-off for choosing the
    number of SV components to keep. It currently has no effect on the
    returned values and is retained for backward compatibility.

    Returns
    -------
    Q : array
        Left singular vectors (square, since full matrices are requested).

    S : array
        Diagonal matrix of the singular values.

    P : array
        Right singular vectors as columns.

    NSSD : array
        Normalized sum of squared distances: each squared singular value
    divided by the sum of all squared singular values.
    """
    q, s, p = np.linalg.svd(X, full_matrices=True)
    Q = q
    S = np.diag(s)
    P = p.T
    NSSD = (s / np.sqrt(np.sum(s**2)))**2

    return Q, S, P, NSSD


def project_svd(q, s, k):
    """Project the design matrix on to the first k singular vectors

    Parameters
    ----------
    q : array
        Matrix whose columns are the singular vectors.

    s : array
        Diagonal matrix of singular values.

    k : int
        Number of leading components to keep.

    Returns
    -------
    array
        Product of the first k columns of q and the top-left k-by-k block
    of s.
    """
    leading_vectors = q[:, :k]
    leading_values = s[:k, :k]
    return leading_vectors.dot(leading_values)


def plot_svd_ulam(data, num_comp, num_ing, fill_fn, manual_fill_values):
    """Plot the strongest loadings of one SV component

    Builds a bar chart of the num_ing features with the largest absolute
    loading on the chosen component, ordered from strongest to weakest.

    Parameters
    ----------
    data : dataframe
        pandas dataframe whose columns are the SV components. The rows
    should contain the loadings for each feature.

    num_comp : int
        The order or number of SV component to plot (1-based).

    num_ing : int
        Number of features to show.

    fill_fn : function
        Function that maps the loading value to a color.

    manual_fill_values : list
        List of HEX colors found in fill_fn.

    Returns
    -------
    ggplot
        The plotnine plot object.
    """
    # Loadings of the requested component, one row per feature
    top = data.iloc[:, num_comp - 1].reset_index()
    top.columns = ['ing', 'loading']

    # Keep the features with the largest magnitude
    top['abs_loading'] = np.abs(top['loading'])
    top = top.sort_values('abs_loading', ascending=False).head(num_ing)

    # Human-readable labels, kept in ranked order on the axis
    labels = top['ing'].str.replace('_', ' ').str.title()
    top['ing'] = labels
    top['ing'] = pd.Categorical(top['ing'], categories=top['ing'], ordered=True)
    top['ing_color'] = fill_fn(top['loading'])

    bars = geom_bar(aes(fill='ing_color'), stat='identity', show_legend=False)
    fill_scale = scale_fill_manual(values=manual_fill_values)

    return (
        ggplot(top, aes(x='ing', y='loading'))
        + bars
        + fill_scale
        + xlab('')
        + ylab('')
        + proj_theme
    )


def plot_svd_zoomed(data, zoom_on,
                    num_comp, num_ing,
                    fill_fn, manual_fill_values):
    """Plot a zoomed view of one SV component and save it as a PNG

    The loadings of the component are sorted in ascending order, narrowed
    down according to zoom_on, drawn as a horizontal bar chart and written
    to 'SV<num_comp>_<zoom_on>.png'. Nothing is returned.

    Parameters
    ----------
    data : dataframe
        pandas dataframe whose columns are the SV components. The rows
    should contain the loadings for each feature.

    zoom_on : ['dominant', 'close to zero']
        'dominant' keeps the num_ing lowest and the num_ing highest
    loadings. 'close to zero' keeps rows around the loading nearest to 0.
    Any other value keeps every feature.

    num_comp : int
        The order or number of SV component to plot (1-based).

    num_ing : int
        Number of features in the SV to show. Affects the zoom_on parameter.

    fill_fn : function
        Function that maps the loading value to a color.

    manual_fill_values : list
        List of HEX colors found in fill_fn.
    """
    loadings = data.iloc[:, num_comp - 1].sort_values().reset_index()
    loadings.columns = ['ing', 'loading']
    n_rows = loadings.shape[0]

    keep = None
    if zoom_on == 'dominant':
        # Both tails of the sorted loadings
        keep = [*range(0, num_ing), *range(n_rows - num_ing, n_rows)]
    elif zoom_on == 'close to zero':
        smallest = min(np.abs(loadings['loading'] - 0))
        centre = loadings[np.abs(loadings['loading']) == smallest].index[0]
        # NOTE(review): the window skips the 5 rows on each side of the
        # loading nearest to zero — confirm this offset is intended.
        keep = [
            *range(centre - num_ing, centre - 5),
            *range(centre + 5, centre + num_ing),
        ]

    if keep is not None:
        loadings = loadings[loadings.index.isin(keep)]

    labels = loadings['ing'].str.replace('_', ' ').str.title()
    loadings['ing'] = labels
    loadings['ing'] = pd.Categorical(
        loadings['ing'], categories=loadings['ing'], ordered=True
    )
    loadings['ing_color'] = fill_fn(loadings['loading'])

    bars = geom_bar(aes(fill='ing_color'), stat='identity', show_legend=False)
    chart = (
        ggplot(loadings, aes(x='ing', y='loading'))
        + bars
        + scale_fill_manual(values=manual_fill_values)
        + coord_flip()
        + xlab('')
        + ylab('')
        + proj_theme
    )

    out_name = ''.join(['SV', str(num_comp), '_', zoom_on, '.png'])
    chart.save(out_name, width=8, height=6)


def final_clustering(transformed_arr,
                     orig_df,
                     method,
                     threshold,
                     plot_threshold,
                     break_biggest_cluster=False,
                     link_colors=['#5594BA',
                                  '#EFC564',
                                  '#DA4D2E',
                                  '#722502',
                                  '#EFC564',
                                  '#B3C55A']):
    """Run hierarchical clustering and draw the resulting dendrogram

    A linkage is computed on transformed_arr and cut at threshold. When
    break_biggest_cluster is set, the rows of orig_df that fall in the most
    populated cluster are re-clustered with Ward linkage and that second
    linkage is the one plotted and returned.

    Parameters
    ----------
    transformed_arr : array
        Matrix to be placed in the clustering algorithm

    orig_df : dataframe
        Dataframe aligned row-by-row with transformed_arr; only used when
    break_biggest_cluster is True.

    method : string
        The method parameter of hierarchy.linkage.

    threshold : float
        Distance threshold used to cut the first linkage into clusters.

    plot_threshold : float
        Passed as ``p`` to the level-truncated dendrogram. For plotting
    purposes only.

    break_biggest_cluster : bool
        Determines whether the biggest cluster will be further broken down.

    link_colors : list
        List of HEX colors for the dendrogram plot. Applied through scipy's
    link color palette.

    Returns
    -------
    Z : array
        Linkage matrix that was plotted.
    """
    brown = '#722502'

    tree = hierarchy.linkage(
        transformed_arr, method=method, optimal_ordering=True
    )
    hierarchy.set_link_color_palette(link_colors)
    labels = fcluster(tree, t=threshold, criterion='distance')

    if break_biggest_cluster:
        labelled = orig_df.copy()
        labelled['cluster_no'] = labels

        # Label of the most populated cluster
        biggest = labelled.groupby(['cluster_no']).size().idxmax()
        members = labelled[labelled['cluster_no'] == biggest]
        members = members.drop(columns=['cluster_no'])
        tree = linkage(members, method='ward', optimal_ordering=True)

    fig, ax = plt.subplots(figsize=(8, 5), dpi=100)

    # Backgrounds
    fig.patch.set_facecolor('white')
    fig.patch.set_alpha(0.6)
    ax.patch.set_facecolor('white')
    ax.patch.set_alpha(0.0)

    # Frame and tick colors
    for side in ('bottom', 'top', 'right', 'left'):
        ax.spines[side].set_color(brown)
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, colors=brown)
    ax.set_ylabel(r'$\Delta$')

    dendrogram(tree, ax=ax, p=plot_threshold, truncate_mode='level')

    return tree


def plotly_clusters(data, y_ref, x_ref, c_map):
    """Plot the nutrition clustering results on the ingredient SV components

    Projects the dishes on the ingredient SV components (SV2 on the x axis,
    SV5 on the y axis) and colors them by their nutrition clustering results.
    This allows the user to associate the natural tendencies of the dishes'
    ingredients with the natural clustering of the dishes' nutritional
    content. The figure is displayed with ``fig.show()``; nothing is returned.

    Parameters
    ----------
    data : dataframe
        pandas dataframe that contains at least the columns ``SV2``, ``SV5``,
        ``cluster_names``, and ``dish_name``.

    y_ref : float
        y value at which the horizontal reference line is drawn.

    x_ref : float
        x value at which the vertical reference line is drawn.

    c_map : dict
        Dictionary that maps the cluster names to a HEX color.

    """

    # Both reference lines and both axes share the same styling.
    reference_line = dict(color='rgba(114, 37, 2, 0.5)', width=2.5)
    axis_style = dict(
        showline=True, linewidth=2, linecolor='#722502', gridcolor='#F7E2B1'
    )

    fig = px.scatter(
        data.sort_values('cluster_names'),
        x='SV2',
        y='SV5',
        color='cluster_names',
        hover_data=['dish_name'],
        labels={
            'SV2': '<b>Meat and Vegetable Range</b>',
            'SV5': '<b>Flavor Range</b>',
            'cluster_names': '<b>Cluster</b>',
            'dish_name': '<b>Dish Name</b>'},
        # Use the c_map argument; the previous code referenced an undefined
        # local name (`cmap`) and only worked if a global of that name
        # happened to exist in the kernel.
        color_discrete_map=c_map,
        width=1000, height=800)
    fig.update_traces(marker={'size': 14})
    fig.for_each_trace(
        lambda t: t.update(textfont_color='#99D072', textposition='top right')
    )

    # Horizontal reference line at y = y_ref, spanning the full plot width.
    fig.add_shape(
        type='line',
        yref='y',
        y0=y_ref,
        y1=y_ref,
        xref='paper',
        x0=0,
        x1=1,
        line=reference_line
    )
    # Vertical reference line at x = x_ref, spanning the full plot height.
    fig.add_shape(
        type='line',
        yref='paper',
        y0=0,
        y1=1,
        xref='x',
        x0=x_ref,
        x1=x_ref,
        line=reference_line
    )

    fig.update_layout(
        title={
               'xanchor': 'center',
               'yanchor': 'top',
               'x': 0.5},
        margin=dict(l=0, r=0, t=0, b=0),
        font={'size': 20, 'color': '#722502'},
        plot_bgcolor='white',
        # Both axes are pinned to [-1, 1]; points outside are not shown.
        xaxis_range=[-1, 1],
        yaxis_range=[-1, 1],
        paper_bgcolor='white'
    )
    fig.update_xaxes(**axis_style)
    fig.update_yaxes(**axis_style)
    fig.show()


def plot_nutrients(data, cluster_name):
    """Plot the nutrient contents of a cluster

    Plots the nutrient contents of the given cluster as horizontal bars,
    one facet per nutrient group, and saves the figure as
    ``<cluster_name>.png`` in the current working directory. Nothing is
    returned.

    Parameters
    ----------
    data : dataframe
        pandas dataframe indexed by cluster name, with one column per
        nutrient (e.g., the median nutrient content by cluster name).

    cluster_name : str
        Cluster name whose nutrient contents are to be plotted.
    """

    # Nutrient -> facet label. The insertion order is also the order in
    # which the nutrients are laid out on the axis.
    # NOTE(review): Table 2 of the writeup lists fat in grams and the
    # minerals in milligrams, which disagrees with the unit suffixes of the
    # Lipids and Minerals labels; the labels are kept unchanged pending
    # confirmation.
    nutrient_groups = {
        'Carbohydrates': 'Carbs (g)',
        'Sugar': 'Carbs (g)',
        'Fiber': 'Carbs (g)',
        'Protein': 'Proteins (g)',
        'Cholesterol': 'Lipids (mg)',
        'Fat': 'Lipids (mg)',
        'Vitamin A': 'Vitamins (mg)',
        'Vitamin C': 'Vitamins (mg)',
        'Sodium': 'Minerals (g)',
        'Potassium': 'Minerals (g)',
        'Calcium': 'Minerals (g)',
        'Iron': 'Minerals (g)',
    }
    # Facet label -> bar color. The insertion order is the facet order.
    group_colors = {
        'Carbs (g)': '#EFC564',
        'Proteins (g)': '#722502',
        'Lipids (mg)': '#DA4D2E',
        'Vitamins (mg)': '#F4903E',
        'Minerals (g)': '#8D9F38',
    }

    cluster_bar_df = data.loc[cluster_name].reset_index()
    cluster_bar_df.columns = ['Nutrient', 'Value']

    # Rescale Vitamin A only (0.6 / 1000 per unit) so it can share the
    # "Vitamins (mg)" facet with Vitamin C.
    # NOTE(review): Table 11 of the writeup lists 0.0003 as the IU-to-mg
    # factor while this code applies 0.0006 - confirm which one is intended.
    is_vitamin_a = cluster_bar_df['Nutrient'].isin(['Vitamin A'])
    cluster_bar_df['Value'] = np.where(
        is_vitamin_a,
        cluster_bar_df['Value']*0.6/1000,
        cluster_bar_df['Value']
    )

    # Nutrients missing from the lookup end up as NaN in both columns.
    cluster_bar_df['Nutrient Group'] = pd.Categorical(
        cluster_bar_df['Nutrient'].map(nutrient_groups),
        categories=list(group_colors),
        ordered=True
    )
    cluster_bar_df['Nutrient'] = pd.Categorical(
        cluster_bar_df['Nutrient'],
        categories=list(nutrient_groups),
        ordered=True
    )

    p = (
        ggplot(cluster_bar_df, aes(x='Nutrient', y='Value')) +
        geom_bar(
            aes(fill='Nutrient Group'),
            stat='identity',
            width=0.6,
            position='dodge',
            show_legend=False
        ) +
        # Colors are listed in the same order as the facet categories.
        scale_fill_manual(values=list(group_colors.values())) +
        coord_flip() +
        facet_wrap('Nutrient Group', scales='free', ncol=1) +
        xlab('') +
        ylab('') +
        # Invisible layer at y=100 so that every facet's value axis extends
        # to at least 100.
        geom_blank(aes(y=100)) +
        proj_theme2
    )

    p.save(cluster_name+'.png', width=5, height=10)

Executive Summary

The combination of flavors and ingredients in Filipino cuisine has come a long way: from being a brainchild of Western and Asian cooking techniques into becoming a distinct style that we Filipinos love and appreciate to this very date. However, this poses a cliché question: are the things we love truly good for us? With the recent emergence of health concerns, Filipinos are becoming more careful, especially with regards to the food they eat.

This study aims to discover the groupings of Filipino dishes according to their nutritional value and see how the dishes’ ingredients contribute to such groupings. Given this overarching problem, this study used two main data sources: the Ingredient Dataset and the Nutritional Value Dataset which were scraped and compiled from the Panlasang Pinoy website. The ingredients list was used to identify the natural combinations of Filipino ingredients, while information about the dishes' nutritional value was used in creating the nutrition-based clusters. Out of all the clustering methods explored, Ward's provided the best visual and domain knowledge interpretation while still retaining parsimony. Using this methodology, this study offers two sets of findings: (1) the nutrient-based clusters of Filipino dishes and their corresponding interpretation, and (2) the nutrient-based clusters projected on the ingredient space. This allows us to not only understand the underlying nutritional trends in our dishes but also identify healthier alternative ingredients for otherwise unhealthy dishes.

The team discovered four nutrient-based clusters of Filipino dishes: a Typical Filipino Meal cluster, characterized by strong taste and absence of vegetables that led to a deficiency in vitamins and minerals; an Upgraded Filipino Meal cluster, which contains improved amounts of vitamins that is driven by the presence of tomatoes; a Super Filipino Meal cluster, which is considered to be the healthiest out of all clusters due to the recognizable amount of fruits and vegetables, and a Fatty and Salty Filipino Meal, characterized by high amounts of sodium and cholesterol. When projected on the ingredient components, almost all clusters are clumped together in the sweet and salty flavor range and are spread across the meat and vegetable range. This suggests that regardless of the combination of meat and vegetables found in a Filipino dish, as long as it is in the sweet and salty flavor range, the dish will almost always fall under the Typical Filipino Meal, the Upgraded Filipino Meal, or the Fatty and Salty Filipino Meal. The more the dishes make use of pork and garlic combination instead of the lean meat and crops combination, the more likely it is to belong to the Fatty and Salty cluster. However, if this pork and garlic combination is in a nutty sauce that is accompanied by vitamin-enriched vegetables, then they are likely to belong to the Super Filipino Meal.

Aside from the intent of providing use cases that utilize the insights from ingredient-based clusters, the team suggests having a balanced focus on nutrition as well. With nutrition-based clustering, business owners and passionate chefs can craft an entirely different menu that gives priority to nutrition or find innovative ways to turn unhealthy dishes into superfoods. For future studies, the team recommends looking at various recipes from other cookbooks and restaurants that could help provide better results.

Introduction

Background

We, Filipinos, love food; there is no doubt about this. The richness of flavor in Filipino Cuisine is one of a kind and truly phenomenal. However, can we say the same about its nutritional value? Is the world-famous Adobo healthy? What about our Mechado or Afritada?

Contrary to most of our Southeast Asian neighbors, Filipino dishes usually shy away from using herbs and are inclined to use unique ingredients like offal, gizzard, and chicken intestines. Apart from having a variety of ingredients, the style of making Filipino dishes pulls a lot from Spanish and American influences, as well as Asian influences such as Chinese and Indonesian [1]. Amidst the boldness and creativity found in our dishes, we wonder if their nutritional value is as commendable as their flavors.

As the years have gone by, people are becoming more health-conscious than ever, especially now because of the ongoing pandemic [2]. While practicing consciousness about our health is easy whenever we go to a restaurant or call for food delivery, there is little literature yet about how healthy, in general, homecooked Filipino dishes are. Variations of a Filipino dish grow multiplicatively, while the Filipino mass remains ignorant of the source's nutritional content. The researchers suspect that because of this, the orientation for innovation and reinvention in the Filipino cuisine generally pointed towards experimenting with flavors and textures but not towards finding healthier alternatives.

This challenge served as the group's inspiration to study the nutritional patterns of popular Filipino dishes as the first step towards helping the Filipinos be more intentional with their cooking and eating choices while still enjoying good food. In this study, we will uncover not only insights about the dishes' overall nutrient content, but also identify ingredient combinations that are likely to be healthier, and thus could serve as a starting point in creating healthier dish variants. The researchers hope that this study will encourage Filipinos to cook and consume healthier menus by identifying unhealthy dishes while providing alternatives, and re-orienting innovation and reinvention of traditional Filipino recipes towards a healthier track.

Problem Statement

What do Filipino dishes look like from a nutrition-based standpoint?

  1. How do the nutrition-based clusters describe the Filipino dishes based on the following nutrient content?
    • Carbs (carbohydrates, fiber, sugar)
    • Lipids (fat, cholesterol)
    • Proteins (protein)
    • Vitamins (vitamin A, vitamin C)
    • Minerals (calcium, potassium)
  1. Are there any suggestive patterns about the nutritional content vis-a-vis the ingredients used in the dishes? (e.g., Does pork and soy sauce combination generally result in unhealthy dishes?)
  1. How do the variants of the same dish compare with regards to their nutritional content? (e.g., Are some Adobo variants healthier than the others?)

Data Sources and Description

The study used the recipes from Panlasang Pinoy; a Filipino food blog created by Mr. Vanjo Merano to showcase his passion for cooking and promote Filipino Cuisine to the rest of the world. Each Filipino dish page is presented in a blog-like fashion, complete with a narrative, cooking tips, and of course the recipe itself. The recipe contains brief information about the dish, cooking utensils and equipment needed, ingredients, and instructions (Figure 1). Some recipes are also accompanied by their estimated nutritional value based on the said ingredients. Compared to other sources, the units of both the ingredients and nutritional content in this website are relatively more standardized.

Figure 1. A Sample Recipe from Panlasang Pinoy

Only the relevant information from the recipe, such as the serving size, ingredients list, and nutrition content of the recipe was scraped. The ingredients list was used to identify the natural combinations of Filipino ingredients, while the nutrition content was used in creating the nutrition-based clusters. While information about the ingredients was not used in clustering per se, they were vital in interpreting the clustering results, particularly in understanding whether a combination of ingredients results in healthy or unhealthy dishes.

Ingredient Dataset

The ingredient lists for the different Filipino dishes were compiled into the Ingredient Dataset. Each of its rows corresponds to a dish, while each of its columns corresponds to an ingredient. Ingredients that do not appear in the dish were assigned a value of 0. Further, ingredients that represent almost the same flavor and texture were collapsed (e.g., black pepper and peppercorn were collapsed into pepper). Table 1 below lists all the ingredients that can be found in the dataset, grouped according to their kind.


Table 1. Column Names of the Ingredient Information Dataset
Meat
Grains, Fruits, and Veggies
Base Liquids
Herbs, Spices, and Condiments
beef_insides ampalaya beef_stock achiote
beef bamboo_shoots beer adobo_sauce
canned_meat banana black_soda annatto
chicharon bell_pepper chicken_stock bagoong
chicken_insides black_bean clear_soda baking_powder
chicken bok_choy coconut_milk banana_flower
corned_beef bread ice bay_leaf
egg broccoli milk butter
hotdog cabbage pineapple_juice calamansi
pork_and_beans carrot pork_stock cheese
pork_blood celery tomato_liquid chili
pork_fat chickpea water cinnamon
pork_insides coconut_meat coconut_water
pork corn cooking_oil
salted_egg cucumber cooking_wine
sausage eggplant cornstarch
shrimp glutinous_rice cream
green_bean curry_powder
green_pea fish_sauce
jackfruit flour
jicama garlic
kangkong ginataang_gulay_mix
malunggay ginger
munggo hoisin_sauce
mushroom honey
noodle hot_sauce
okra kasubha
olive lechon_sauce
papaya leeks
pasta lemongrass
patola liquid_seasoning
pea liver_spread
peanut mayonnaise
pechay mirin
pepper_leaf nutmeg
pickle olive_oil
pie_crust onion
pineapple oyster_sauce
potato paprika
radish parsley
raisins pepper
rice safflower_oil
sayote salt
squash scallion
sweet_potato sesame_oil
taro shortening
tofu shrimp_cube
tomato sinigang_mix
watermelon soy_sauce
winged_bean star_anise
wrapper sugar
toasted_rice_powder
turmeric
vinegar
worcestershire_sauce
yeast


Finally, to allow better comparability and interpretation, the unit of measurements underwent a two-step scaling method (see Data Preparation for more details). In this way, all dry ingredients in the dataset are measured in grams, all wet ingredients are measured in cups, and all ingredients that come in packs and bundles (e.g., dozen) are measured in an individual count of pieces.

Nutritional Value Dataset

The nutritional content for the different Filipino dishes was compiled into the Nutritional Value Dataset. Its rows correspond to a dish, while each of its columns corresponds to the recipe's suggested serving size, along with its different nutrient contents: carbohydrates, fiber, sugar, fat, cholesterol, protein, vitamin A, vitamin C, calcium, and potassium (Table 2). Since all units of measurement for the nutrients are the same throughout the website, only a minimal standardization step was performed (see Data Preparation for more details).


Table 2. Column Name and Description of the Nutritional Value Dataset
Column
Description
Serving Number of servings in the recipe
Carbohydrates Total amount of carbohydrates in grams (g)
Protein Total amount of protein in grams (g)
Fat Total amount of fat in grams (g)
Cholesterol Total amount of cholesterol in milligrams (mg)
Sodium Total amount of sodium in milligrams (mg)
Potassium Total amount of potassium in milligrams (mg)
Fiber Total amount of fiber in grams (g)
Sugar Total amount of sugar in grams (g)
A Total amount of Vitamin A in international unit (IU)
C Total amount of Vitamin C in milligrams (mg)
Calcium Total amount of calcium in milligrams (mg)
Iron Total amount of iron in milligrams (mg)

Creating the Filipino Dishes Database

The team created an SQLite database, named ulam_nut.db, consisting of two tables that hold the information relevant and necessary to answer all the study's research questions. The tables are created according to the information they contain: one containing ingredient information, and the other containing nutritional value. Table 3 below summarizes the contents of the database, followed by a preview of the mentioned tables.


Table 3. Filipino Dishes Database Content Summary
Attribute
Information
Total number of Filipino dishes 282 dishes
Total number of ingredients 137 ingredients
Total number of nutrients 13 nutrients
Table 4. Ingredient Information Table Preview
In [5]:
# Load the ingredient and nutrition tables (retrieve_data is defined outside
# this excerpt) and preview the first rows of the ingredient table.
df_ing, df_nut = retrieve_data()
df_ing.head()
Out[5]:
dish_name yeast wrapper worcestershire_sauce winged_bean vinegar turmeric tomato tofu toasted_rice_powder ... shrimp pineapple water salt rice pork corn chicken beef banana
0 Sinarsahang Manok 0.0 0.0 0.0 0.0 0.0000 0.0 2.5 0.0 0.0 ... 0.0 0.0 1.5 0.0 0.0 0.0 0.0 680.389364 0.0 0.0
1 Pinoy Chicken Curry Recipe 0.0 0.0 0.0 0.0 0.0000 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 907.185819 0.0 0.0
2 Ketchup Fried Chicken 0.0 0.0 0.0 0.0 0.3125 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.000000 0.0 0.0
3 Chicken Barbecue 0.0 0.0 0.0 0.0 0.0000 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 2.0 0.0 0.0 0.0 4.000000 0.0 0.0
4 Chicken Paksiw Recipe 0.0 0.0 0.0 0.0 0.5000 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 1.0 0.0 0.0 0.0 453.592909 0.0 0.0

5 rows × 137 columns

Table 5. Nutritional Value Table Preview
In [6]:
# Preview the first rows of the nutrition table.
df_nut.head()
Out[6]:
dish_name Serving Calories Carbohydrates Protein Fat Cholesterol Sodium Potassium Fiber Sugar A C Calcium Iron
0 Sinarsahang Manok 4 640 32 39 1 128 1700 1181 8 19 3000 78 114 5
1 Pinoy Chicken Curry Recipe 6 497 7 30 1 113 131 587 1 2 908 31 53 4
2 Ketchup Fried Chicken 4 643 33 4 1 1 1815 288 1 25 367 4 41 1
3 Chicken Barbecue 4 62 12 4 1 1 3056 202 1 8 163 16 21 1
4 Chicken Paksiw Recipe 4 441 21 23 1 90 594 293 1 15 228 5 44 2

Data Assumptions and Limitations

This study is not without limitations and assumptions. The researchers acknowledge that there are different interpretations for a single Filipino dish and that the ones from Panlasang Pinoy are not to be considered as the gold standard. With that said, the recipes in this study ultimately rely on the interpretation of the website's owner and contributors. The researchers also have to impose an assumption that the nutritional value of the dish found on Panlasang Pinoy is correct, and that the conversion multipliers used in correspondingly scaling all ingredients into grams (dry), cups (wet), and pieces (bundles) are correct.

As we want to allow the algorithms to learn the underlying patterns beyond merely the protein types, we only considered pork, chicken, and beef dishes. Further, we did not inform the algorithm about the techniques used in the dishes (e.g., boiled, braised, etc.), sources of origin (whether cultural or location-based), and whether the dish is a variant of an existing one. While these are valuable pieces of information, they are too difficult to define and find proxies for given the limited time.

Methodology

Data Preparation

During the scraping and preparation of the Ingredient Dataset and Nutritional Value Dataset, the following steps were performed:

Data Preparation for the Ingredient Dataset

  1. Collapsing the ingredient list. Multiple ingredients of similar names and nature were initially found in the dataset. To provide an accurate interpretation of results, the team decided to group similar ingredients by looking for certain keywords using regular expression. Table 6 below lists down all the ingredients that have been collapsed (i.e., groups of size greater than 1), along with the keywords used in parsing. For the complete list of ingredients and their corresponding keywords, please see the Appendix.
Table 6. Groupings of Ingredients (Group Size > 1) with Corresponding Keywords
Ingredient Group (feature)
Keywords
pork_insides "pig’s liver", "pig’s heart", "pig’s small intestine", "bung", "pig cheeks", "pig heart", "pig kidney", "pig stomach", "pork ears", "pork large intenstine", "pork liver", "small intestine"
pork_stock "pork broth", "pork stock", "pork cube"
noodle "noodle", "pancit", "sotanghon", "misua", "miswa"
kangkong "spinach", "kangkong"
green_pea "green pea", "pigeon pea"
green_bean "green beans", "sitaw", "snake beans", "string beans", "snap pea", "snow pea"
eggplant "eggplant", "talong"
cooking_oil "cooking oil", "vegetable oil"
coconut_milk "coconut cream", "coconut milk"
clear_soda "7-up", "sprite", "clear softdrink"
sausage "chinese sauage", "chorizo"
chicken_stock "chicken broth", "chicken cube"
canned_meat "potted meat", "luncheon meat"
calamansi "calamansi", "lemon", "lime"
butter "butter", "margarine"
bok_choy "bok choy", "bokchoy"
black_soda "coke", "cola"
beef_insides "lard", "cow", "beef heart", "beef kidney", "beef large instestine", "beef liver", "beef neck bone", "beef small intestine", "bile", "tripe", "tongue", "tripe", "lengua"
beef_stock "beef cube", "beef bouillon", "bulalo", "beef broth", "beef stock"
tomato_liquid "ketchup", "tomato sauce", "tomato paste", "spaghetti sauce"
pepper "white pepper", "black pepper", "crushed pepper", "peppercorn"
chili "chili", "pepper flakes", "serrano pepper", "sili", "jalapeno", "ghost pepper", "green pepper"
bagoong "alamang", "shrimp paste", "balaw"
liquid_seasoning "liquid seasoning", "savorrich", "marinade"
chickpea "chick pea", "garbanzos"
chicken_insides "chicken gizzard", "chicken hear", "chicken liver"
pineapple_juice "pineapple juice", "juice from the canned tidbits"
pasta "spaghetti", "macaroni"
rice "rice", "sinangag"
pork "pork", "pig", "lechon"
beef "beef", "steak", "oxtail", "ox tail", "sirloin", "bistek"
banana "banana", "plantain"
  1. Dropping of low-variance ingredients. Columns whose values are 99% the same were dropped. These ingredients are pea, leeks, cucumber, turmeric, and glutinous_rice.
  1. Standardizing the ingredient amounts. All units of measurement for the ingredients underwent a two-step scaling method: (1) ensuring that all dry ingredients are measured in grams (Table 7), all wet ingredients are measured in cups (Table 8), and all ingredients that come in packs and bundles (e.g., dozen) are measured in an individual count of pieces (Table 9), and afterward (2) applying Min-Max scaling. Min-Max scaling was used to retain the integrity of the positive ingredient amounts.
Table 7. Conversion of Units of Measurement for Dry Ingredients
Unit
Equivalent Amount to 1 gram
pounds(lbs.) 0.00220462
pinch 0.355625
thumb 9.5


Table 8. Conversion of Units of Measurement for Wet Ingredients
Unit
Equivalent Amount to 1 cup
liter(L) 0.236588
quart(qt) 0.25
pint 0.5
ounce(oz.) 8
tablespoon(tbsp.) 16
teaspoon(tsp.) 48
milliliter(ml) 236.588


Table 9. Conversion of Units of Measurement for Ingredients in Packs
Unit
Equivalent Amount to 1 Piece
pack 1
can 1
bundle 5
dozen 12
  1. Performing dimensionality reduction on the ingredients. By the time the Ingredient Dataset has reached the third step, the final set of features stands at 137 ingredients, with a lot of them having a value of 0 (representing the absence of the ingredient in the dish). To reduce the number of features, and ultimately, to understand the natural tendencies of ingredients to group or separate, dimensionality reduction was performed. Since the data is sparse, singular value decomposition (SVD) was used. The two resulting singular value components that had the most straightforward interpretation, SV component 2 and SV component 5 were used later in the Descriptive Analysis where the clusters are interpreted.

Data Preparation for the Nutritional Value Dataset

  1. Filtering dishes with incomplete nutritional value information. Not all 282 dishes have available nutritional value information. After requiring that the dishes should have a non-null value for carbohydrates, fiber, sugar, fat, cholesterol, protein, vitamin A, vitamin C, calcium, and potassium, only 129 dishes were left eligible for clustering.
  1. Dropping of low-variance nutrients. Given the same variance threshold of 99%, no nutrient column was dropped.
  1. Standardizing the nutrient amounts. Table 10 shows the default unit of measurements of the nutrients when scraped from Panlasang Pinoy. The measurements also underwent a two-step scaling method: (1) ensuring that all nutrient amounts are in grams or milligrams (Table 11), and afterward (2) applying Min-Max scaling. Min-Max scaling was used to retain the integrity of the positive nutrient amounts.
Table 10. Units of Measurement for Nutrients
Nutrient Group
Nutrient
Unit of Measurement
Carbs carbohydrates, sugar, fiber grams (g)
Proteins protein grams (g)
Lipids cholesterol, fat milligrams (mg)
Vitamins vitamin A international units (IU)
Vitamins vitamin C milligrams (mg)
Minerals sodium, potassium, calcium, iron grams (g)


Table 11. Conversion of Units of Measurement for Nutrient Amounts not in Metric System
Unit
Equivalent Amount to 1 milligram
IU (Vitamin A; beta-carotene) 0.0003

Since the number of features in the Nutritional Value Dataset is only 12 to start with, no dimensionality reduction was applied. Sensitivity analysis was performed to verify the influence of performing SVD or principal component analysis, but no significant changes in the clustering results were found. However, for any plotting requirements that made use of the Nutritional Value Dataset (e.g., plotting clustering results), SVD was performed.

Clustering

Four clustering methods were explored, two representative-based and two hierarchical algorithms: k-Means, k-Medians, and hierarchical clustering using Ward's method and complete linkage. For k-Means and k-Medians, k values from 2 to 11 were explored, while for the hierarchical methods, sensitivity analysis on the dendrogram cut-off points was performed. Whenever the hierarchical clustering methods produced a big cluster that could still be further re-clustered, a separate dendrogram cut-off point for that cluster was used (Figures 2 and 3).

In choosing the best method, the researchers considered the following factors:

  1. Good internal validation metrics, inertia, silhouette score, Calinski-Harabasz, and Gap statistic (for representative-based clustering methods only).
  2. Visually-pleasing clusters projected on the first two SV components of the Nutritional Value dataset.
  3. Sensible cluster interpretations based on domain expertise.

Out of the considered methods, Ward's method satisfied all three requirements. This study only contains the codes for Ward's clustering. For the complete exploration codes, please see UlamNut_Clustering (testing).ipynb.

In [7]:
# manual_fix_dtypes and drop_features are defined outside this excerpt.
# NOTE(review): presumably manual_fix_dtypes casts the listed columns to
# float - confirm against the helper.
fixed_df = manual_fix_dtypes(
    df_nut,
    float_cols=['Calories', 'Carbohydrates', 'Protein', 'Fat',
                'Cholesterol', 'Sodium', 'Potassium', 'Fiber',
                'Sugar', 'A', 'C', 'Calcium', 'Iron', 'Serving'])
# Keep only dishes that report a positive serving count and positive calories.
fixed_df = fixed_df[fixed_df['Serving'] > 0]
fixed_df = fixed_df[fixed_df['Calories'] > 0]
# Divide every column after the first by the serving count to get per-serving
# amounts (assumes dish_name is the first column, as in the df_nut preview).
# Serving itself is part of the slice; it is dropped right below.
fixed_df.iloc[:, 1:] = fixed_df.iloc[:, 1:].div(fixed_df['Serving'], axis=0)
# Serving and Calories are not used as clustering features.
# NOTE(review): p=0 is presumably the variance threshold - confirm.
dropped_df_orig = drop_features(
    fixed_df, irrelevant_cols=['Serving', 'Calories'], skip_cols=None, p=0
)
dropped_df = dropped_df_orig.drop('dish_name', axis=1)
# Min-max scale the nutrient columns; X is the matrix that is passed to the
# clustering step later in the notebook.
X = np.array(dropped_df, dtype=float)
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
In [8]:
# fix_dtypes and drop_features are defined outside this excerpt.
# NOTE(review): fixed_df_ing is never used below - drop_features receives the
# original df_ing. Unless fix_dtypes mutates df_ing in place, its result is
# discarded; confirm which frame was intended.
fixed_df_ing = fix_dtypes(df_ing)
dropped_df_orig_ing = drop_features(
    df_ing, irrelevant_cols=None, skip_cols=None, p=0
)
dropped_df_ing = dropped_df_orig_ing.drop('dish_name', axis=1)
# Min-max scale the ingredient amounts. Note that this rebinds `scaler`, so
# the scaler fitted on the nutrient matrix is no longer reachable by name.
X_ing = np.array(dropped_df_ing, dtype=float)
scaler = MinMaxScaler()
X_ing = scaler.fit_transform(X_ing)
In [9]:
# truncated_svd and project_svd are defined outside this excerpt.
# NOTE(review): p is presumably the feature-by-component weight matrix, since
# it is indexed by the ingredient names below - confirm against the helper.
q, s, p, nssd = truncated_svd(X_ing)
# NOTE(review): 60 is presumably the number of components kept - confirm.
X_new_ing = project_svd(q, s, 60)
feature_names = dropped_df_ing.columns
weights_df_ing = pd.DataFrame(p, index=feature_names)
# Rename the integer column labels 0, 1, ... to SV1, SV2, ... and keep the
# first 60 columns.
weights_df_ing.columns = ['SV'+str(c+1) for c in weights_df_ing.columns]
weights_df_ing = weights_df_ing.iloc[:, :60]
In [10]:
# Inject a CSS rule that centers PNG outputs in the rendered notebook.
Center(""" <style>
.output_png {
    display: table-cell;
    text-align: center;
    vertical-align: middle;
}
</style> """)
Out[10]:
In [11]:
# First pass: Ward clustering of the scaled nutrient matrix. final_clustering
# draws the dendrogram and returns the linkage matrix.
# NOTE(review): the positional arguments presumably follow the order listed in
# the function's docstring (matrix, original frame, method, threshold,
# plot_threshold, break_biggest_cluster) - confirm against the signature.
Z = final_clustering(
    X,
    dropped_df_orig.drop(columns='dish_name'),
    'ward',
    2.5,
    5,
    False,
    link_colors=['#DA4D2E', '#8D9F38']
)
# Cut the returned linkage at the same distance (2.5) to get flat cluster
# labels and attach them to the unscaled nutrient frame.
y_pred = fcluster(Z, t=2.5, criterion='distance')
X_cluster = dropped_df_orig.assign(cluster_no=y_pred)
Figure 2. Ward's Clustering Dendrogram Results (cut-off = 2.5)
In [12]:
# Second pass: re-cluster only the biggest cluster of the first pass.
# With break_biggest_cluster=True, final_clustering builds the returned
# linkage from the rows of the unscaled frame passed as orig_df, which is
# presumably why the cut-off used below (4000) is on a much larger scale
# than the first-pass cut-off (2.5).
Z_1 = final_clustering(
    X, dropped_df_orig.drop(columns='dish_name'),
    'ward',
    2.5,
    5,
    break_biggest_cluster=True,
    link_colors=['#722502', '#CC9500', '#1F77B4', '#1F77B4']
)
y_pred_1 = fcluster(Z_1, t=4000, criterion='distance')
# Select the rows of the biggest first-pass cluster from X_cluster.
get_big_cluster = X_cluster.groupby(['cluster_no']).size().idxmax()
X_cluster_1 = (
    X_cluster[X_cluster['cluster_no'] == get_big_cluster]
    .drop(columns='cluster_no')
)
# Offset the sub-cluster labels by 3 before writing them back by index.
# NOTE(review): assumes the first pass produced at most 3 clusters, so the
# offset labels cannot collide with first-pass labels - confirm.
X_cluster_1['cluster_no'] = y_pred_1 + 3
X_cluster.loc[X_cluster_1.index] = X_cluster_1
# Shift every label down by one.
# NOTE: X_cluster is mutated in place here, so re-running this cell without
# first re-running the previous one changes the labels again.
X_cluster['cluster_no'] = X_cluster['cluster_no'] - 1
Figure 3. Zooming in on the Biggest Cluster in Figure 2 (cut-off = 4000)

Cluster Interpretation

The group interpreted the clusters by looking at the nutritional content of the dishes in their respective groups concurrently with the ingredients that make them up. This way, we were able to form insights not only on the healthiness of the dish but also on what ingredients are dominant in terms of their nutritional value.

In [13]:
# Put the projected ingredient components into a frame keyed by dish name.
svd_ing = (
    pd.DataFrame(X_new_ing, index=dropped_df_orig_ing['dish_name'])
    .reset_index()
)
# Rename the integer component columns 0, 1, ... to SV1, SV2, ...
svd_ing.columns = (
    ['dish_name'] + ['SV'+str(int(c)+1) for c in svd_ing.columns[1:]]
)
# Right join on dish_name: every row of X_cluster is kept and receives the
# first five ingredient components.
# NOTE(review): assumes dish names are unique in both frames; duplicated names
# would multiply rows in the merge - confirm.
plotly_df = (
    pd.merge(
        svd_ing[['dish_name', 'SV1', 'SV2', 'SV3', 'SV4', 'SV5']],
        X_cluster,
        on='dish_name',
        how='right'
    )
)


def custom_names(x):
    """Translate a numeric cluster label into its descriptive cluster name

    Labels other than 1, 2, 3, or 4 yield None.
    """

    cluster_labels = (
        # High Cholesterol, High Sodium
        (1, 'Fatty and Salty Filipino Meal'),
        # High Protein & Less Fat, High Vitamins & Fiber
        (2, 'Super Filipino Meal'),
        # Bland and low vitamins and minerals
        (3, 'Typical Filipino Meal'),
        # Bland but high vitamins and minerals
        (4, 'Upgraded Filipino Meal'),
    )
    for cluster_no, cluster_name in cluster_labels:
        if x == cluster_no:
            return cluster_name
    return None


# Apply custom_names element-wise to the numeric cluster labels and store the
# result as a new column.
# NOTE(review): `func` is rebound to a different vectorized function later in
# this same cell.
func = np.vectorize(custom_names)
plotly_df['cluster_names'] = func(plotly_df['cluster_no'])


def custom_color(x):
    """Pick the plot colour for a numeric cluster label.

    Labels 1 to 4 each have their own hex colour. Any other value gets a
    translucent near-white ``rgba`` fallback.
    """

    # NOTE(review): the tags below read like ingredient-based descriptions;
    # confirm they still describe the nutrient-based clusters.
    palette = (
        (1, '#DA4D2E'),  # carrot + potato, saucy
        (2, '#8D9F38'),  # leafy greens, soupy
        (3, '#722502'),  # carrot + potato, soupy
        (4, '#EFC564'),  # vinegar + soy sauce, sweet
    )
    for label, colour in palette:
        if x == label:
            return colour
    return 'rgba(249, 248, 252, 0.4)'


# Rebind `func` to the element-wise colour lookup and store one colour string
# per row, derived from the numeric cluster label.
func = np.vectorize(custom_color)
plotly_df['plotly_colors'] = func(plotly_df['cluster_no'])
In [14]:
# Median nutrient profile of each named cluster, ordered by carbohydrates.
# numeric_only=True restricts the aggregation to numeric columns: plotly_df
# also carries text columns (dish_name, plotly_colors), and pandas >= 2.0
# raises a TypeError on those instead of silently dropping them as 1.x did.
bar_df = (
    plotly_df
    .groupby('cluster_names')
    .median(numeric_only=True)
    .sort_values('Carbohydrates')
)
# The SVD components and the numeric cluster label are not nutrients.
bar_df = bar_df.drop(
    columns=['SV1', 'SV2', 'SV3', 'SV4', 'SV5', 'cluster_no']
)
# Positional rename to display labels. Assumes exactly these 12 nutrient
# columns remain, in this order — TODO confirm against the nutrient dataset.
bar_df.columns = [
    'Carbohydrates', 'Protein', 'Fat', 'Cholesterol', 'Sodium',
    'Potassium', 'Fiber', 'Sugar', 'Vitamin A', 'Vitamin C',
    'Calcium', 'Iron',
]

Results

Nutrient-based Clusters of Filipino Dishes

This study ended up with a roster of Filipino dishes clustered into four clusters. From the groups formed, the differences in nutritional content were evident: There was a clear separation between healthy and unhealthy dishes. With further examination, we observe that some key ingredients strongly affect the nutritional value of a dish.

In [15]:
# Draw the nutrient figure for the 'Typical Filipino Meal' cluster
# (captioned Figure 7).
plot_nutrients(bar_df, 'Typical Filipino Meal')
Nutrient Content of the Typical Filipino Meal Cluster
Figure 7. Nutrient Content of the Typical Filipino Meal Cluster

This cluster contains the largest number of dishes. We would find the most common Filipino food served in households in this cluster, most notably the world-famous adobo. However, the food here contains the least amount of nutrients in comparison with the three other clusters. Since the typical Filipino food belongs in this cluster, Figure 7 represents the average nutritional value of Filipino Dishes. In other words, the majority of our dishes fall flat when it comes to nutritional value.

Since we have a wide variety of dishes here, it is difficult to generalize what they have in common. What is clear is that they have the strong taste and the bold flavor that is signature to the Filipino Cuisine. It is backed up by the observation that more than eighty percent of the dishes in this cluster have pepper as one of the ingredients. In a more relatable Filipino expression, these are dishes that will make us order an extra cup of rice. More than the nutritional value, Filipinos eat the dishes in this cluster because it satisfies the Pinoy palate. While there are no doubts that the crowd-favorites are here, the nutrition we get from them is dubious.

There was no single ingredient driving the numbers this low, but the apparent lack of leafy vegetables in the dishes in this cluster has a lot to do with the scarcity of nutritional value. The upside here would be that there is a lot of room for reinventions and innovations in the traditional recipes to make them healthier. But as it stands, the traditional recipes we most likely follow, give little nutrition. What the group suggests here is the integration of nutritious ingredients that do not have a strong taste. Bland ingredients could go well with the dishes that inherently have a strong taste. Some examples would be spinach, asparagus, potato, and egg. We also suggest using leaner meat when cooking. In addition to this, we can do away with the pre-made sauces and seasoning and use the more natural ones instead.

In [16]:
# Draw the nutrient figure for the 'Upgraded Filipino Meal' cluster
# (captioned Figure 8).
plot_nutrients(bar_df, 'Upgraded Filipino Meal')
Nutrient Content of the Upgraded Filipino Meal Cluster
Figure 8. Nutrient Content of the Upgraded Filipino Meal Cluster

This cluster is similar to the Typical Filipino Meal Cluster but with slightly better nutritional value. We can also see some crowd-favorites here like Menudo, Afritada, and Caldereta. Most dishes in the Upgraded Filipino Meal Cluster are tomato-based. There is an instant upgrade in terms of nutritional value because it has tomatoes. Some ingredients naturally go well with tomato-based sauces like potatoes, carrots, and various herbs, which explains the improvement in Vitamin A, Potassium, and Vitamin C.

There is still a lack of leafy vegetables in this cluster. We think this is because most healthy vegetables have a bitter taste, which does not usually work with a tomato-based sauce. It is why the vitamin C, calcium, and potassium contents of the food in this cluster trail the ones in the Super Filipino Meal cluster by a significant margin. In terms of meat quality, the type of meat is better here as it uses leaner cuts. We can see this in the uptick in the protein content and just a little uptick in fat content. This means that the dishes here do not necessarily rely on the fat content of the meat for the savory flavor. The taste of the sauce here is more dominant compared to the other ingredients.

In terms of innovation in the recipes, we observe that the recipes here are easily modifiable. We can take the classic Menudo, for example. A healthier variant of this dish can be made by using leaner pork cuts, putting more carrots, and adding more bell pepper; a healthier version without compromising the taste. Changing the meat in the dish is also something to be considered. We can have chicken, pork, and beef versions of Caldereta, Menudo, and Afritada for all we want.

In [17]:
# Draw the nutrient figure for the 'Super Filipino Meal' cluster
# (captioned Figure 9).
plot_nutrients(bar_df, 'Super Filipino Meal')
Nutrient Content of the Super Filipino Meal Cluster
Figure 9. Nutrient Content of the Super Filipino Meal Cluster

With the highest amount of vitamins A and C, Calcium, Iron, Fiber, and Potassium, it is apparent that the healthiest Filipino dishes are in this cluster. It is also high in protein, with less fat and cholesterol, because the meat in the recipes usually uses chicken. The carbohydrates are the highest in this cluster because of starchy vegetables or crops like potatoes and carrots. Even though high carbohydrates in our ulams are sometimes frowned upon because we will still pair these dishes with white rice (all the carbs we need!), the starchy vegetables come with a healthy amount of potassium and vitamins. It is a good enough upside to take in exchange for additional carbohydrates. The amount of iron is also high in this cluster because of ingredients like bagoong and liver spread. Recipes here use green beans and even fruits like pineapple, which causes a high amount of dietary fiber in the cluster.

As we delved deeper into studying the Super Filipino Meal cluster, we found what is probably the healthiest common Pinoy dish ingredient in Pechay or Bok Choy. This leafy vegetable has high amounts of calcium, Vitamin A, and Vitamin C. In addition to this, it goes well with soup or sauce-based dishes such as Nilaga or Kare-Kare. Given the health benefits of Pechay, it made sense that dishes that include this in the recipe are in the healthiest cluster.

We could argue that this cluster has not only the healthiest food but also the most cost-efficient: soup-based dishes serve more people, and vegetables are generally cheaper. However, the group identifies some relevant limitations. We observed that dishes in the second cluster usually require lots of ingredients, which may have contributed to why these are not served as often as we think they should be. Another factor would be that dishes here can be intimidating for people who do not like vegetables. For example, Kare-Kare is not popular with kids because of the number of vegetables present in this dish. In a typical Filipino household, kids can dictate what food will be served in family meals, and this can be an issue especially when they like food that is relatively unhealthy.

In [18]:
# Draw the nutrient figure for the 'Fatty and Salty Filipino Meal' cluster
# (captioned Figure 10).
plot_nutrients(bar_df, 'Fatty and Salty Filipino Meal')
Nutrient Content of the Fatty and Salty Filipino Meal Cluster
Figure 10. Nutrient Content of the Fatty and Salty Filipino Meal Cluster

Dishes in this cluster are generally salty and have high cholesterol. The group attributes it to several factors, like the parts of the meat used in the dishes. For example, there are recipes here that use Pata (Pork Leg), a pork part known to be high in fat and cholesterol. Viral Pinoy posts about having high blood pressure or even heart attacks are usually associated with dishes like Crispy Pata. The high protein in this cluster is due to the high amount of meat in the dish relative to the other ingredients and not necessarily about the quality of the protein itself. It explains why the fat and cholesterol amounts are through the roof, too. High sodium, on the other hand, could be associated with the use of salt and liquid seasonings in most dishes. Some recipes also require processed goods like corned beef, which has a high sodium content.

In general, dishes in this cluster are unhealthy. There is an unusually high amount of cholesterol, fat, and sodium, which could cause health issues in the long run. Luckily, the usual Filipino food that we eat is not in this cluster. The food here is cooked and served occasionally. In terms of recommendations, the group did not come up with good substitutes or even twists to make the food here healthier. The unhealthiness of the dishes here is mainly because of the type of meat used and the seasonings that come with it. There is just no replacing Pata in Crispy Pata; it makes this Filipino food sinfully good. The same goes with most dishes here; this is not your typical recipe where you can change the meat content, but it essentially stays the same dish.

Not all is lost about including this in a diet. Even though the food in this cluster, in general, is served occasionally, they fit in some diet programs. The ketogenic diet is becoming more and more popular, and dishes in the first cluster are commonly included in this type of diet. Essentially, Keto requires a very high fat intake and ultra-low carbohydrate intake. Pata dishes are the best candidates here. As for how healthy it is, experts seem to have conflicting ideas and takes on Keto. On the one hand, some would argue that it is unhealthy and unsustainable. On the other hand, some would be content and happy with the results, which usually include a drastic fat loss.

Projecting Nutrient-based Clusters on Ingredient SV Components

Instead of projecting the clustering results on the nutrient components, projecting them on the ingredient components allows us to better understand the nutritional value associated with different ingredient combinations. That is, by looking at the nutritional value of the ingredients' resulting dishes, we can identify ingredient combinations that usually result in healthy dishes, which ultimately allows us to uncover healthier alternative ingredients for otherwise unhealthy dishes.

For readability and ease of understanding, we will only project the results on a two-dimensional ingredient space. With interpretability as the researchers' utmost priority, SV2 and SV5 of the Ingredient Dataset, which cumulatively explain 9.81% of the data's variation, were chosen. Figures 4 and 5 below show how the researchers assigned meanings to these components.

In [19]:
def custom_color(x):
    """Choose a fill colour from the sign of ``x``.

    Negative values give '#722502' and positive values give '#DA4D2E'.
    A value that is neither (for example exactly zero) gives ``None``.
    """
    # NOTE(review): this rebinds the name custom_color, which an earlier cell
    # defined as a cluster-label -> colour lookup.
    if x < 0:
        return '#722502'
    return '#DA4D2E' if x > 0 else None


# Element-wise wrapper around custom_color, handed to the plotting helper.
fill_fn = np.vectorize(custom_color)

# Pass the wrapper built above instead of vectorizing custom_color a second
# time; previously fill_fn was assigned and never used.
plot_svd_zoomed(weights_df_ing,
                zoom_on='dominant',
                num_comp=2,
                num_ing=12,
                fill_fn=fill_fn,
                manual_fill_values=['#8D9F38', '#DA4D2E'])
Ingredients with heavy negative loadings represent pork and garlic combo, while ingredients with heavy positive loadings represent lean meat and crops combo
Figure 4. Interpretation of SV2 from the Ingredient Dataset

The dominant ingredients in SV2, from both the positive and the negative end of the spectrum, include a kind of meat and a variety of vegetables. Particularly, ingredients with heavy negative loadings include pork and flavoring plants and vegetables (garlic, pepper, bay leaf), while ingredients with heavy positive loadings include chicken and crop vegetables (potatoes and carrots). At first glance, one could say that the ingredients with positive loadings would generally result in healthier dishes; however, ingredients with negative loadings also have a fair share of vegetables such as kangkong and eggplant. This is a glimpse of how attempting to associate nutritional value by merely looking at the ingredients is not as straightforward as one would expect. These ingredients, or combinations of them, could either define the overall nutritional value of the dish or at times be a game-changer to its nutritional content. In the end, it is important to look at the resulting dishes from these combinations of ingredients to better understand the extent of the nutritional value they offer.

In [20]:
def custom_color(x):
    """Choose a fill colour from the sign of ``x``.

    Negative values give '#5E6A25' and positive values give '#8D9F38'.
    A value that is neither (for example exactly zero) gives ``None``.
    """
    colour = None
    if x < 0:
        colour = '#5E6A25'
    elif x > 0:
        colour = '#8D9F38'
    return colour


# Element-wise wrapper around custom_color, handed to the plotting helper.
fill_fn = np.vectorize(custom_color)

# Pass the wrapper built above instead of vectorizing custom_color a second
# time; previously fill_fn was assigned and never used.
plot_svd_zoomed(weights_df_ing,
                zoom_on='close to zero',
                num_comp=5,
                num_ing=15,
                fill_fn=fill_fn,
                manual_fill_values=['#722502', '#722502'])
In [21]:
def custom_color(x):
    """Choose a fill colour from the sign of ``x``.

    Negative values give '#5E6A25' and positive values give '#8D9F38'.
    A value that is neither (for example exactly zero) gives ``None``.
    """
    # NOTE(review): same definition as the custom_color in the previous cell.
    if x < 0:
        return '#5E6A25'
    return '#8D9F38' if x > 0 else None


# Element-wise wrapper around custom_color, handed to the plotting helper.
fill_fn = np.vectorize(custom_color)

# Pass the wrapper built above instead of vectorizing custom_color a second
# time; previously fill_fn was assigned and never used.
plot_svd_zoomed(weights_df_ing,
                zoom_on='dominant',
                num_comp=5,
                num_ing=12,
                fill_fn=fill_fn,
                manual_fill_values=['#F4903E', '#8D9F38'])
Ingredients with heavy negative loadings are dominated with peanut-y flavor, while ingredients with heavy positive loadings are dominated with sour and zesty flavors
Ingredients with loadings close to zero are a mix of salty and sweet
Figure 5. Interpretation of SV5 from the Ingredient Dataset

SV5 is a bit more complex than the previous component. This time, the whole range of the spectrum, not just its ends, was inspected before an appropriate interpretation was assigned. The first part of Figure 5 shows the dominant ingredients in SV5, from both the positive and the negative end of the spectrum, while the second half shows the ingredients that are close to 0 (i.e., possibly pointing away from SV5). When pieced together, this shows the flavor range of the Filipino dishes: from nutty flavors (peanut and annatto) to salty (soy sauce, hoisin sauce) or sweet (honey, pineapple), and to sour and zesty (sinigang mix, radish).

Using SV2, which represents the meat and vegetable range of Filipino dishes, and SV5, which represents the flavor range of Filipino dishes, the following two-dimensional space was created.

Filipino dishes were clustered into 4 main groups based on their carbs, proteins, lipids, vitamins, and minerals content.

Each point in the scatter plot represents a Filipino dish. The different colors represent the different groupings or clusters of the dishes based on nutrient content. The horizontal position indicates the kinds of meat and vegetables in the dish (from the pork and garlic/green vegetables combo to the lean meat and crops combo), while the vertical position indicates the dominating flavor of the dish (from nutty, through sweet and salty, to sour).

In [22]:
# One row per distinct (cluster name, colour) pairing present in the plot data.
dict_df = (
    plotly_df
    .loc[:, ['cluster_names', 'plotly_colors']]
    .drop_duplicates()
)
# Cluster name -> colour lookup handed to the plotting helper.
cmap = {
    name: colour
    for name, colour in zip(dict_df['cluster_names'], dict_df['plotly_colors'])
}

plotly_clusters(plotly_df, y_ref=0, x_ref=0, c_map=cmap)
Figure 6. Nutrient-based Clusters Projected on the Ingredient Space

It can be observed that almost all clusters are clumped together in the sweet and salty flavor range and are spread across the meat and vegetable range. This suggests that regardless of the combination of meat and vegetables found in a Filipino dish, as long as it falls in the sweet and salty flavor range, it will almost always fall under the Typical Filipino Meal, the Upgraded Filipino Meal, or the Fatty and Salty Filipino Meal. Interestingly, the farther a dish sits towards the left side of the plot, that is, the more it makes use of the pork and garlic combination, the more likely it is to be Fatty and Salty. In fact, out of all the dishes that use lean meat, only Chicken Hamonado and Pinoy Fried Chicken Recipe were tagged as Fatty and Salty.

Another interesting observation is that there appears to be a linear trend for the meat and vegetable range and the flavor range among the Super Filipino Meal dishes. That is, dishes that make use of the pork and garlic combination, as long as they have a nutty flavor, will likely fall under Super Filipino Meal. Perhaps this is because nutty sauces are often accompanied by vitamin-enriched vegetables such as pechay or bokchoy. Interestingly, Sinigang dishes, which fall under the second quadrant (dishes that make use of pork but have a sour flavor), are spread out across nutrient clusters. This suggests that the current variants of Sinigang were born of experimentation on the flavor and texture (e.g., deep-frying the meat) while giving only little consideration to its nutrient content.

Finally, there are currently no dishes in the first quadrant (dishes that make use of lean meat and crops that are sour). While this could serve as room for innovation from a flavor perspective, it also opens up avenues for innovation from a nutrition standpoint. That is, since Super Filipino Meals follow a linear trend in this 2D ingredient space, the researchers wonder: had there been existing Filipino dishes in the first quadrant, would they naturally belong to the Super Filipino Meals as well?

Discussion and Recommendations

In the food business, the word fusion is for a curious and intriguing combination of dishes and cooking techniques from different cuisines that work together. However, given how our cuisine has combined different foreign cooking styles and recipes with our local techniques and native ingredients, we think that it is a fusion in itself. The diversity, creativeness, and boldness of our food were evident in the number of the different recipes in the data set and the ingredients that make them up. This variety translated to a wide range of nutritional values in our food, too. As the group focused on nutrition-based clustering in this study, we were able to produce insights on the healthiness of the Filipino dishes, the common ingredients that give a dish a high nutritional value, and how a data-driven approach to studying our food could have both practical and business use cases.

We presented general insights on each of the clusters. Some of the notable observations we had would be that the most common Filipino dishes fall under the Typical Filipino Meal Cluster, which, unfortunately, has the lowest nutritional value among all the clusters. The crowd-favorites are focused on satisfying the Pinoy palate more than providing the necessary nutrition. As for why the dishes here are the most common other than the taste, it may be because the dishes here are simple to make in that they require the fewest ingredients. Notable in this cluster is the lack of leafy vegetables, which could be incorporated in the classic recipes because the food here has strong flavors. The healthy greens are usually bland with a touch of bitterness, which could be balanced out by the dominant flavors present in the dishes here.

Another interesting observation was that all but one of the Adobo variants were in this cluster. The Adobo with Tokwa and Tausi dish was in the Fatty and Salty Filipino Meal Cluster because the recipe included Garlic Fried Rice Adobo. The latter required additional adobo sauce, which led to the dish having a higher-than-expected sodium content compared with a usual Adobo dish. There was no actual variation in the nutrients of the adobo dishes even if the meat was switched out or additional ingredients such as soft drinks were included in the recipe. This means that the traditional adobo dish is light when it comes to nutritional value. Sinigang variants, on the other hand, covered all the clusters. This shows that even though Sinigang sounds like a standard Filipino dish, mixing up its ingredients can drastically affect the nutritional value. The usage of different pork parts affects the protein, fat, and cholesterol contents, while the amount and variety of vegetables affect the remaining micronutrients.

The healthy Super Filipino Meal Cluster is composed of dishes with green leafy vegetables, most notably Pechay. There are considerably fewer dishes in this cluster compared to the Typical Filipino Meal Cluster, which means that there is not much variety just yet in this cluster. This said, there is room for creating variations among the healthy dishes, but it certainly requires vegetables to be included in the recipe. Making good-tasting dishes that require fewer ingredients can also help in convincing Filipinos to eat healthier. In general, we need to explore using vegetables in more recipes because it was clear that they are the primary drivers for nutrients such as vitamins and other minerals. The same can be said about the Upgraded Filipino Meal Cluster. Dishes here were mostly tomato-based. It includes a few vegetables and crops like carrots, potatoes, and herbs which causes an increase in nutritional value. The initial purpose of this study was to come up with practical applications such as identifying a cost-efficient combination of dishes to serve on occasions, creating a good menu with the right balance of contrasting flavors, and reinventing traditional recipes, which would all be useful for the typical Filipino family. However, as the team did analysis and clustering, we realized that some business use cases will inevitably come out of this study. This study could help food businesses, especially those branding themselves as healthy, create a menu that maximizes nutritional value and profit. The healthy dishes that belong to the Super Filipino Meal Cluster could headline their food choices. They can also reinvent the unhealthy crowd-favorites to a healthier version. Businesses could use both ingredient-based and nutrition-based clustering as their guide in their food research and development.

A good extension of this project would be expanding the dataset to include other dishes and other recipe sources. There are hundreds of dishes in the Philippines, each prepared uniquely depending on the place of origin. Some may be similar, but some can be drastically different (e.g., Luzon's Pochero vs. Mindanao Pochero). Having a larger data set that accounts for these similarities and differences would lead to better research on the nutrients and ingredients of Filipino Dishes. There could also be additional business use cases such as starting a food business with themes like using only one base dish (e.g., different versions of Kare-Kare) or even using just one base ingredient (e.g., Lapu-Lapu ala Su-Tu-Kil). Adobo Connection is one example of this kind of business, although they started having other dishes on their menu during the last few years. From a data-driven perspective, we can see that from the hundreds of recipes and thousands of ingredients, there is so much room for innovation in this space. With data science, there is infinite potential in the food industry.

Appendix

Table 1. Groupings of Ingredients with Corresponding Keywords
Ingredient Group (feature)
Keywords
yeast "yeast"
wrapper "wrapper"
worcestershire_sauce "worcestershire"
winged_bean "winged bean"
vinegar "vinegar"
turmeric "tumeric"
tomato "tomato"
tofu "tofu"
toasted_rice_powder "toasted rice powder"
taro "taro"
sweet_potato "sweet potato"
sugar "sugar"
star_anise "star anise"
squash "kalabasa", "squash"
soy_sauce "soy sauce"
sinigang_mix "sinigang"
shortening "shortening"
sesame_oil "sesame oil"
scallion "scallion"
sayote "sayote"
salted_egg "salted egg"
safflower_oil "safflower oil"
raisins "raisins"
radish "radish"
potato "potato"
pork_insides "pig’s liver", "pig’s heart", "pig’s small intestine", "bung", "pig cheeks", "pig heart", "pig kidney", "pig stomach", "pork ears", "pork large intenstine", "pork liver", "small intestine"
pork_fat "pork fat"
pork_stock "pork broth", "pork stock", "pork cube"
pork_blood "pork blood"
pork_and_beans "pork and beans"
pie_crust "pie"
pickle "pickle", "relish"
pepper_leaf "pepper leaves"
pechay "pechay"
peanut "peanut"
patola "patola"
parsley "parsley"
paprika "paprika"
papaya "papaya"
oyster_sauce "oyster"
onion "onion"
olive_oil "olive oil"
olive "olive"
okra "okra"
nutmeg "nutmeg"
noodle "noodle", "pancit", "sotanghon", "misua", "miswa"
mushroom "mushroom"
munggo "mung"
mirin "mirin"
mayonnaise "mayonnaise"
malunggay "malunggay"
liver_spread "liver"
lemongrass "lemongrass"
leeks "leeks"
lechon_sauce "lechon"
kasubha "kasubha"
kangkong "spinach", "kangkong"
jicama "jicama"
jackfruit "jackfruit"
ice "ice"
hotdog "hotdog"
hot_sauce "hot"
honey "honey"
hoisin_sauce "hoisin"
green_pea "green pea", "pigeon pea"
green_bean "green beans", "sitaw", "snake beans", "string beans", "snap pea", "snow pea"
glutinous_rice "glutinous rice"
ginger "ginger"
ginataang_gulay_mix "ginataang"
garlic "garlic"
flour "flour"
eggplant "eggplant", "talong"
egg "egg"
curry_powder "curry"
cucumber "cucumber"
cream "cream"
cooking_wine "wine"
cooking_oil "cooking oil", "vegetable oil"
coconut_water "coconut water"
coconut_milk "coconut cream", "coconut milk"
coconut_meat "coconut meat"
clear_soda "7-up", "sprite", "clear softdrink"
cinnamon "cinnamon"
sausage "chinese sauage", "chorizo"
chicken_stock "chicken broth", "chicken cube"
chicharon "chicharon"
cheese "cheese"
celery "celery"
carrot "carrot"
canned_meat "potted meat", "luncheon meat"
calamansi "calamansi", "lemon", "lime"
cabbage "cabbage"
butter "butter", "margarine"
broccoli "broccoli"
bread "bread"
bok_choy "bok choy", "bokchoy"
black_soda "coke", "cola"
black_bean "black bean"
beer "beer"
beef_insides "lard", "cow", "beef heart", "beef kidney", "beef large instestine", "beef liver", "beef neck bone", "beef small intestine", "bile", "tripe", "tongue", "tripe", "lengua"
beef_stock "beef cube", "beef bouillon", "bulalo", "beef broth", "beef stock"
bay_leaf "bay"
bamboo_shoots "bamboo shoots"
baking_powder "baking powder"
annatto "annatto"
ampalaya "ampalaya"
adobo_sauce "adobo"
achiote "achiote"
tomato_liquid "ketchup", "tomato sauce", "tomato paste", "spaghetti sauce"
banana_flower "blossom"
pepper "white pepper", "black pepper", "crushed pepper", "peppercorn"
chili "chili", "pepper flakes", "serrano pepper", "sili", "jalapeno", "ghost pepper", "green pepper"
bell_pepper "bell pepper"
bagoong "alamang", "shrimp paste", "balaw"
liquid_seasoning "liquid seasoning", "savorrich", "marinade"
chickpea "chick pea", "garbanzos"
chicken_insides "chicken gizzard", "chicken hear", "chicken liver"
cornstarch "cornstarch"
corned_beef "corned beef"
fish_sauce "fish sauce"
pineapple_juice "pineapple juice", "juice from the canned tidbits"
shrimp_cube "shrimp cube"
watermelon "watermelon"
milk "milk"
pea "pea"
pasta "spaghetti", "macaroni"
shrimp "shrimp"
pineapple "pineapple"
water "water"
salt "salt"
rice "rice", "sinangag"
pork "pork", "pig", "lechon"
corn "corn"
chicken "chicken"
beef "beef", "steak", "oxtail", "ox tail", "sirloin", "bistek"
banana "banana", "plantain"
Figure 1. Complete List of Dishes per Nutrient-based Clusters
In [23]:
# Unique dish names collected under each numeric cluster label.
clusters = X_cluster.groupby('cluster_no')['dish_name'].unique()

# The printed number is the 1-based position of the group in iteration order,
# not the stored cluster_no value itself.
for position, dishes in enumerate(clusters, start=1):
    print(f'Cluster {position}', dishes, ' ', sep='\n')
Cluster 1
['Pinoy Fried Chicken Recipe' 'Chicken Hamonado' 'Air Fryer Crispy Pata'
 'Pork Dinuguan Isaw at Tenga' 'Sweet Pata Asado' 'Turbo Crispy Liempo'
 'Pork Adobo with Tokwa and Tausi and Garlic Fried Rice Adobo'
 'Sinigang na Pata with Gabi' 'Crispy Pata Recipe'
 'Rellenong Talong Recipe (Stuffed Eggplant)'
 'Corned Beef Lomi Batangas Style' 'Tapsilog Recipe']
 
Cluster 2
['Sinarsahang Manok' 'How to Cook Chicken Kare-Kare sa Gata'
 'Pininyahang Manok sa Gata (Pineapple Chicken in Coconut Milk)'
 'Chicken Sotanghon Soup with Malunggay and Sayote' 'Chicken Mami Recipe'
 'Nilagang Manok' 'Chicken Pochero' 'Chicken Kare Kare Recipe'
 'Chicken Mami' 'Crispy Kare-kare Recipe' 'Pata Pochero'
 'How To Make Pata Kare-Kare' 'Crispy Liempo Sinigang Rice'
 'Pochero with Pork and Beans' 'Spicy Pork Kaldereta'
 'Pork Kare Kare Recipe' 'Bulalo Kare-kare'
 'Kare-Kare Recipe (Beef Tripe)' 'How to Cook Kare Kare']
 
Cluster 3
['Pinoy Chicken Curry Recipe' 'Ketchup Fried Chicken' 'Chicken Barbecue'
 'Chicken Paksiw Recipe' 'Chicken Binagoongan Recipe'
 'Pininyahang Manok Recipe (Pineapple Chicken)'
 'Chicken Feet in Oyster Sauce with Salted Black Beans'
 'Killer Chicken Adobo' 'Adobong Manok sa Gata' 'Chicken Lollipop Recipe'
 'Quick and Easy Fried Chicken' 'Ginataang Manok with Papaya'
 'Easy Chicken Adobo Recipe' 'Pork and Chicken Adobo Recipe'
 'Chicken Lumpia' 'Sarciadong Manok' 'Chicken Sotanghon Soup' 'Inasal'
 'Filipino Chicken Adobo Recipe' 'Pinoy Style Chicken Macaroni Salad'
 'Tinolang Manok Recipe' 'Binagoongan Bagnet with Talong'
 'Nagmamantikang Pork Adobo' 'Binagoongang Talong' 'Siopao Asado Recipe'
 'Bicol Express Gising-gising Recipe' 'Bicol Express Recipe'
 'Easy Lechon Paksiw' 'How to Cook Humba (Bisaya Version)'
 'Tokwa at Baboy Humba (Pork Humba with Tofu)' 'Spare Ribs Hamonado'
 'Coke Pork Adobo' 'Sprite Pork Adobo Recipe' 'Crispy Pork Binagoongan'
 'Pork Siomai Recipe' 'Sinigang na Lechon' 'Sizzling Crispy Sisig'
 'Lechon Belly Roll' 'Pinoy Pork and Chicken Curry' 'Hamonadong Baboy'
 'Easy Liempo Inihaw' 'Dinuguan Recipe' 'Lechon Kawali Recipe'
 'Pinoy Pork Barbecue' 'Sinigang na Baboy with Gabi'
 'Crispy Pork Belly Chicharon' 'Pork Adobo' 'Binagoongan Recipe'
 'Deep Fried Siomai' 'How to Cook Embutido' 'How to Cook Bicol Express'
 'Skinless Longganisa Recipe' 'Filipino Style Spaghetti Recipe'
 'Pork Dinakdakan Recipe' 'Lechon Kawali' 'Pork Bicol Express'
 'Liempo Estofado Recipe' 'KBL (Kadyos, Baboy, at Langka) Recipe'
 'Pochero Recipe' 'Beef Salpicao' 'Nilagang Baka Recipe' 'Burger Steak'
 'Beef Pares sa Kanto' 'How to Make Goto Lamang Loob and Tokwa’t Baboy'
 'Kinamatisang Baka' 'Beef Asado Recipe' 'Saucy Beef with Broccoli'
 'Beef Hinalang' 'Beef Kaldereta sa Gata with Peanut Butter'
 'Ampalaya con Carne Recipe' 'Bistek Tagalog Recipe – Pinoy Beefsteak'
 'Papaitan Recipe' 'Beef Mechado Recipe']
 
Cluster 4
['Chicken Igado' 'Pineapple Chicken Afritada' 'Chicken Caldereta'
 'Chicken Pancit Recipe' 'Creamy Chicken Sopas'
 'How to Cook Chicken Pastel' 'Ginataang Manok with Kalabasa Recipe'
 'Pata Estofado' 'Pork Menudo Recipe' 'Pork Caldereta Recipe'
 'Pork Menudo sa Gata Recipe' 'Pork Menudo sa Gata' 'Sinantomas'
 'Spareribs Kaldereta' 'Crispy Dinakdakan Recipe' 'Meaty Spaghetti'
 'Paklay Recipe' 'Pork Lomi' 'Pork Kaldereta sa Gata' 'Sinigang' 'Sisig'
 'Filipino Pork Menudo Recipe' 'Sinigang na Buntot ng Baboy with Gabi'
 'Beef Papaitan' 'Beef Kaldereta Recipe']
 
In [ ]: